diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 792e2e07ec594..37e6250e7c587 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5253,3 +5253,11 @@ def CountedByRef : Builtin { let Attributes = [NoThrow, CustomTypeChecking]; let Prototype = "int(...)"; } + +// Constant-time select builtin +def CtSelect : Builtin { + let Spellings = ["__builtin_ct_select"]; + let Attributes = [NoThrow, UnevaluatedArguments, + ConstIgnoringExceptions, CustomTypeChecking]; + let Prototype = "void(...)"; +} diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 18641a96063cd..17993660ba395 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -872,6 +872,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("ssve-fp8fma", HasSSVE_FP8FMA) .Case("sme-f8f32", HasSME_F8F32) .Case("sme-f8f16", HasSME_F8F16) + .Case("ctselect", true) .Default(false); } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 3de17d2c829f1..423a7b8749658 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -664,6 +664,7 @@ bool ARMTargetInfo::hasFeature(StringRef Feature) const { .Case("hwdiv", HWDiv & HWDivThumb) .Case("hwdiv-arm", HWDiv & HWDivARM) .Case("mve", hasMVE()) + .Case("ctselect", true) .Default(false); } diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index e71f10c4c16fc..45fa0379783fe 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1298,6 +1298,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("cf", HasCF) .Case("zu", HasZU) .Case("branch-hint", HasBranchHint) + .Case("ctselect", true) .Default(false); } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 92dba32698e51..25b95ce0289b7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -26,6 +26,10 @@ #include "TargetInfo.h" #include "clang/AST/OSLog.h" #include "clang/AST/StmtVisitor.h" +#include "clang/AST/OperationKinds.h" +#include "clang/AST/Type.h" +#include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/InlineAsm.h" @@ -6441,6 +6445,40 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, auto Str = CGM.GetAddrOfConstantCString(Name, ""); return RValue::get(Str.getPointer()); } + case Builtin::BI__builtin_ct_select: { + if (E->getNumArgs() != 3) { + CGM.getDiags().Report(E->getBeginLoc(), + E->getNumArgs() > 3 + ? 
diag::err_typecheck_call_too_many_args + : diag::err_typecheck_call_too_few_args); + return GetUndefRValue(E->getType()); + } + + auto *Cond = EmitScalarExpr(E->getArg(0)); + auto *A = EmitScalarExpr(E->getArg(1)); + auto *B = EmitScalarExpr(E->getArg(2)); + + // Verify types match + if (A->getType() != B->getType()) { + CGM.getDiags().Report(E->getBeginLoc(), + diag::err_typecheck_convert_incompatible); + return GetUndefRValue(E->getType()); + } + + // Verify condition is integer type + if (!Cond->getType()->isIntegerTy()) { + CGM.getDiags().Report(E->getBeginLoc(), diag::err_typecheck_expect_int); + return GetUndefRValue(E->getType()); + } + + if (Cond->getType()->getIntegerBitWidth() != 1) + Cond = Builder.CreateICmpNE( + Cond, llvm::ConstantInt::get(Cond->getType(), 0), "cond.bool"); + + llvm::Function *Fn = + CGM.getIntrinsic(llvm::Intrinsic::ct_select, {A->getType()}); + return RValue::get(Builder.CreateCall(Fn, {Cond, A, B})); + } } // If this is an alias for a lib function (e.g. __builtin_sin), emit diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 652527a88b160..d7c283367353c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3472,6 +3472,93 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, if (BuiltinCountedByRef(TheCall)) return ExprError(); break; + + case Builtin::BI__builtin_ct_select: { + if (TheCall->getNumArgs() != 3) { + // Simple argument count check without complex diagnostics + if (TheCall->getNumArgs() < 3) { + return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least) + << 0 << 3 << TheCall->getNumArgs() << 0 + << TheCall->getCallee()->getSourceRange(); + } else { + return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_many_args) + << 0 << 3 << TheCall->getNumArgs() << 0 + << TheCall->getCallee()->getSourceRange(); + } + } + auto *Cond = TheCall->getArg(0); + auto *A = TheCall->getArg(1); + auto *B = TheCall->getArg(2); + + QualType CondTy = Cond->getType(); + if (!CondTy->isIntegerType()) { + return Diag(Cond->getBeginLoc(), diag::err_typecheck_cond_expect_scalar) + << CondTy << Cond->getSourceRange(); + } + + QualType ATy = A->getType(); + QualType BTy = B->getType(); + + // check for scalar or vector scalar type + if ((!ATy->isScalarType() && !ATy->isVectorType()) || + (!BTy->isScalarType() && !BTy->isVectorType())) { + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + + // Check if both operands have the same type or can be implicitly converted + QualType ResultTy; + if (Context.hasSameType(ATy, BTy)) { + ResultTy = ATy; + } else { + // Try to find a common type using the same logic as conditional + // expressions + ExprResult ARes = ExprResult(A); + ExprResult BRes = ExprResult(B); + + // For arithmetic types, allow promotions within the same category only + if (ATy->isArithmeticType() && BTy->isArithmeticType()) { + // Check if both are integer types or both are floating types + bool AIsInteger = ATy->isIntegerType(); + bool BIsInteger = BTy->isIntegerType(); + bool AIsFloating = ATy->isFloatingType(); + bool BIsFloating = BTy->isFloatingType(); + + if ((AIsInteger && BIsInteger) || (AIsFloating && BIsFloating)) { + // Both are in the same category, allow usual arithmetic conversions + ResultTy = UsualArithmeticConversions( + ARes, BRes, TheCall->getBeginLoc(), ArithConvKind::Conditional); + if (ARes.isInvalid() || BRes.isInvalid() || 
ResultTy.isNull()) { + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + // Update the arguments with any necessary implicit casts + TheCall->setArg(1, ARes.get()); + TheCall->setArg(2, BRes.get()); + } else { + // Different categories (int vs float), not allowed + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + } else { + // For non-arithmetic types, they must be exactly the same + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + } + + ExprResult CondRes = PerformContextuallyConvertToBool(Cond); + if (CondRes.isInvalid()) + return ExprError(); + + TheCall->setArg(0, CondRes.get()); + TheCall->setType(ResultTy); + return TheCall; + } break; } if (getLangOpts().HLSL && HLSL().CheckBuiltinFunctionCall(BuiltinID, TheCall)) diff --git a/clang/test/Sema/builtin-ct-select-edge-cases.c b/clang/test/Sema/builtin-ct-select-edge-cases.c new file mode 100644 index 0000000000000..3998e9d68748d --- /dev/null +++ b/clang/test/Sema/builtin-ct-select-edge-cases.c @@ -0,0 +1,384 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter + +// Test with various condition expressions +int test_conditional_expressions(int x, int y, int a, int b) { + // Logical expressions + int result1 = __builtin_ct_select(x && y, a, b); + int result2 = __builtin_ct_select(x || y, a, b); + int result3 = __builtin_ct_select(!x, a, b); + + // Comparison expressions + int result4 = __builtin_ct_select(x == y, a, b); + int result5 = __builtin_ct_select(x != y, a, b); + int result6 = __builtin_ct_select(x < y, a, b); + int result7 = __builtin_ct_select(x > y, a, b); + int result8 = __builtin_ct_select(x <= y, a, b); + int result9 = __builtin_ct_select(x >= y, a, b); + + // Bitwise expressions + int result10 = __builtin_ct_select(x & y, a, b); + int result11 = __builtin_ct_select(x | y, a, b); + int result12 = __builtin_ct_select(x ^ y, a, b); + int result13 = __builtin_ct_select(~x, a, b); + + // Arithmetic expressions + int result14 = __builtin_ct_select(x + y, a, b); + int result15 = __builtin_ct_select(x - y, a, b); + int result16 = __builtin_ct_select(x * y, a, b); + int result17 = __builtin_ct_select(x / y, a, b); + int result18 = __builtin_ct_select(x % y, a, b); + + return result1 + result2 + result3 + result4 + result5 + result6 + result7 + result8 + result9 + result10 + result11 + result12 + result13 + result14 + result15 + result16 + result17 + result18; +} + +// Test with extreme values +int test_extreme_values(int cond) { + // Maximum and minimum values + int max_int = __builtin_ct_select(cond, __INT_MAX__, -__INT_MAX__ - 1); + + // Very large numbers + long long max_ll = __builtin_ct_select(cond, __LONG_LONG_MAX__, -__LONG_LONG_MAX__ - 1); + + // Floating point extremes + float max_float = __builtin_ct_select(cond, __FLT_MAX__, -__FLT_MAX__); + double max_double = __builtin_ct_select(cond, __DBL_MAX__, -__DBL_MAX__); + + return max_int; +} + +// Test with zero and negative zero +int test_zero_values(int cond) { + // Integer zeros + int zero_int = __builtin_ct_select(cond, 0, -0); + + // Floating point zeros + float zero_float = __builtin_ct_select(cond, 0.0f, -0.0f); + double zero_double = __builtin_ct_select(cond, 0.0, -0.0); + + return zero_int; +} 
+ +// Test with infinity and NaN +int test_special_float_values(int cond) { + // Infinity + float inf_float = __builtin_ct_select(cond, __builtin_inff(), -__builtin_inff()); + double inf_double = __builtin_ct_select(cond, __builtin_inf(), -__builtin_inf()); + + // NaN + float nan_float = __builtin_ct_select(cond, __builtin_nanf(""), __builtin_nanf("")); + double nan_double = __builtin_ct_select(cond, __builtin_nan(""), __builtin_nan("")); + + return 0; +} + +// Test with complex pointer scenarios +int test_pointer_edge_cases(int cond) { + int arr[10]; + int *ptr1 = arr; + int *ptr2 = arr + 5; + + // Array pointers + int *result1 = __builtin_ct_select(cond, ptr1, ptr2); + + // Pointer arithmetic + int *result2 = __builtin_ct_select(cond, arr + 1, arr + 2); + + // NULL vs non-NULL + int *result3 = __builtin_ct_select(cond, ptr1, (int*)0); + + // Different pointer types (should fail) + float *fptr = (float*)0; + int *result4 = __builtin_ct_select(cond, ptr1, fptr); // expected-error {{incompatible operand types ('int *' and 'float *')}} + + return *result1; +} + +// Test with function pointers +int func1(int x) { return x; } +int func2(int x) { return x * 2; } +float func3(float x) { return x; } + +int test_function_pointers(int cond, int x) { + // Same signature function pointer + int (*fptr)(int) = __builtin_ct_select(cond, &func1, &func2); + + // Different signature function pointers (should fail) + int (*bad_fptr)(int) = __builtin_ct_select(cond, &func1, &func3); // expected-error {{incompatible operand types ('int (*)(int)' and 'float (*)(float)')}} + + return fptr(x); +} + +// Test with void pointers +void *test_void_pointers(int cond, void *a, void *b) { + return __builtin_ct_select(cond, a, b); +} + +// Test with const/volatile qualifiers +int test_qualifiers(int cond) { + const int ca = 10; + const int cb = 20; + volatile int va = 30; + volatile int vb = 40; + const volatile int cva = 50; + const volatile int cvb = 60; + + // const to const + const int result1 = __builtin_ct_select(cond, ca, cb); + + // volatile to volatile + volatile int result2 = __builtin_ct_select(cond, va, vb); + + // const volatile to const volatile + const volatile int result3 = __builtin_ct_select(cond, cva, cvb); + + return result1 + result2 + result3; +} + +// Test with arrays (should fail as they're not arithmetic or pointer) +int test_arrays(int cond) { + int arr1[5] = {1, 2, 3, 4, 5}; + int arr2[5] = {6, 7, 8, 9, 10}; + + // This should fail?? 
+ int *result = __builtin_ct_select(cond, arr1, arr2); // expected-error {{incompatible operand types ('int[5]' and 'int[5]')}} + + return result[0]; +} + +// Test with structures (should fail) +struct Point { + int x, y; +}; + +struct Point test_structs(int cond) { + struct Point p1 = {1, 2}; + struct Point p2 = {3, 4}; + + return __builtin_ct_select(cond, p1, p2); // expected-error {{incompatible operand types ('struct Point' and 'struct Point')}} +} + +// Test with unions (should fail) +union Data { + int i; + float f; +}; + +union Data test_unions(int cond) { + union Data d1 = {.i = 10}; + union Data d2 = {.i = 20}; + + return __builtin_ct_select(cond, d1, d2); // expected-error {{incompatible operand types ('union Data' and 'union Data')}} +} + +// Test with bit fields (should work as they're integers) +struct BitField { + int a : 4; + int b : 4; +}; + +int test_bit_fields(int cond) { + struct BitField bf1 = {1, 2}; + struct BitField bf2 = {3, 4}; + + // Individual bit fields should work + int result1 = __builtin_ct_select(cond, bf1.a, bf2.a); + int result2 = __builtin_ct_select(cond, bf1.b, bf2.b); + + return result1 + result2; +} + +// Test with designated initializers +int test_designated_init(int cond) { + int arr1[3] = {[0] = 1, [1] = 2, [2] = 3}; + int arr2[3] = {[0] = 4, [1] = 5, [2] = 6}; + + // Access specific elements + int result1 = __builtin_ct_select(cond, arr1[0], arr2[0]); + int result2 = __builtin_ct_select(cond, arr1[1], arr2[1]); + + return result1 + result2; +} + +// Test with complex expressions in arguments +int complex_expr(int x) { return x * x; } + +int test_complex_arguments(int cond, int x, int y) { + // Function calls as arguments + int result1 = __builtin_ct_select(cond, complex_expr(x), complex_expr(y)); + + // Ternary operator as arguments + int result2 = __builtin_ct_select(cond, x > 0 ? x : -x, y > 0 ? 
y : -y); + + // Compound literals + int result3 = __builtin_ct_select(cond, (int){x}, (int){y}); + + return result1 + result2 + result3; +} + +// Test with preprocessor macros +#define MACRO_A 42 +#define MACRO_B 24 +#define MACRO_COND(x) (x > 0) + +int test_macros(int x) { + int result1 = __builtin_ct_select(MACRO_COND(x), MACRO_A, MACRO_B); + + // Nested macros + #define NESTED_SELECT(c, a, b) __builtin_ct_select(c, a, b) + int result2 = NESTED_SELECT(x, 10, 20); + + return result1 + result2; +} + +// Test with string literals (should fail) +const char *test_strings(int cond) { + return __builtin_ct_select(cond, "hello", "world"); // expected-error {{incompatible operand types ('char[6]' and 'char[6]')}} +} + +// Test with variable length arrays (VLA) +int test_vla(int cond, int n) { + int vla1[n]; + int vla2[n]; + + // Individual elements should work + vla1[0] = 1; + vla2[0] = 2; + int result = __builtin_ct_select(cond, vla1[0], vla2[0]); + + return result; +} + +// Test with typedef +typedef int MyInt; +typedef float MyFloat; + +MyInt test_typedef(int cond, MyInt a, MyInt b) { + return __builtin_ct_select(cond, a, b); +} + +// Test with different typedef types (should fail) +MyInt test_different_typedef(int cond, MyInt a, MyFloat b) { + return __builtin_ct_select(cond, a, b); // expected-error {{incompatible operand types ('MyInt' (aka 'int') and 'MyFloat' (aka 'float'))}} +} + +// Test with side effects (should be evaluated) +int side_effect_counter = 0; +int side_effect_func(int x) { + side_effect_counter++; + return x; +} + +int test_side_effects(int cond) { + // Both arguments should be evaluated + int result = __builtin_ct_select(cond, side_effect_func(10), side_effect_func(20)); + return result; +} + +// Test with goto labels (context where expressions are used) +int test_goto_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + + if (result > 0) { + goto positive; + } else { + goto negative; + } + +positive: + return result; + +negative: + return -result; +} + +// Test with switch statements +int test_switch_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + + switch (result) { + case 0: + return 0; + case 1: + return 1; + default: + return -1; + } +} + +// Test with loops +int test_loop_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + int sum = 0; + + for (int i = 0; i < result; i++) { + sum += i; + } + + return sum; +} + +// Test with recursive functions +int factorial(int n) { + if (n <= 1) return 1; + return n * factorial(n - 1); +} + +int test_recursive(int cond, int n) { + int result = __builtin_ct_select(cond, n, n + 1); + return factorial(result); +} + +// Test with inline functions +static inline int inline_func(int x) { + return x * 2; +} + +int test_inline(int cond, int a, int b) { + return __builtin_ct_select(cond, inline_func(a), inline_func(b)); +} + +// Test with static variables +int test_static_vars(int cond) { + static int static_a = 10; + static int static_b = 20; + + return __builtin_ct_select(cond, static_a, static_b); +} + +// Test with extern variables +extern int extern_a; +extern int extern_b; + +int test_extern_vars(int cond) { + return __builtin_ct_select(cond, extern_a, extern_b); +} + +// Test with register variables +int test_register_vars(int cond) { + register int reg_a = 30; + register int reg_b = 40; + + return __builtin_ct_select(cond, reg_a, reg_b); +} + +// Test with thread-local variables (C11) +#if __STDC_VERSION__ >= 201112L +_Thread_local 
int tls_a = 50; +_Thread_local int tls_b = 60; + +int test_tls_vars(int cond) { + return __builtin_ct_select(cond, tls_a, tls_b); +} +#endif + +// Test with atomic variables (C11) +#if __STDC_VERSION__ >= 201112L +#include +atomic_int atomic_a = 70; +atomic_int atomic_b = 80; + +int test_atomic_vars(int cond) { + return __builtin_ct_select(cond, atomic_a, atomic_b); // expected-error {{incompatible operand types ('atomic_int' (aka '_Atomic(int)') and 'atomic_int')}} +} +#endif diff --git a/clang/test/Sema/builtin-ct-select.c b/clang/test/Sema/builtin-ct-select.c new file mode 100644 index 0000000000000..7749eb52eecb3 --- /dev/null +++ b/clang/test/Sema/builtin-ct-select.c @@ -0,0 +1,683 @@ +// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s + +// Test integer types +int test_int(int cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_int + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +long test_long(int cond, long a, long b) { + // CHECK-LABEL: define {{.*}} @test_long + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +short test_short(int cond, short a, short b) { + // CHECK-LABEL: define {{.*}} @test_short + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i16 @llvm.ct.select.i16(i1 [[COND]], i16 %{{.*}}, i16 %{{.*}}) + // CHECK: ret i16 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +unsigned char test_uchar(int cond, unsigned char a, unsigned char b) { + // CHECK-LABEL: define {{.*}} @test_uchar + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i8 @llvm.ct.select.i8(i1 [[COND]], i8 %{{.*}}, i8 %{{.*}}) + // CHECK: ret i8 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +long long test_longlong(int cond, long long a, long long b) { + // CHECK-LABEL: define {{.*}} @test_longlong + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test floating point types +float test_float(int cond, float a, float b) { + // CHECK-LABEL: define {{.*}} @test_float + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +double test_double(int cond, double a, double b) { + // CHECK-LABEL: define {{.*}} @test_double + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test pointer types +int *test_pointer(int cond, int *a, int *b) { + // CHECK-LABEL: define {{.*}} @test_pointer + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}}) + // CHECK: ret ptr [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with different condition types +int test_char_cond(char cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_char_cond + // 
CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +int test_long_cond(long cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_long_cond + // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with boolean condition +int test_bool_cond(_Bool cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_bool_cond + // CHECK: [[COND:%.*]] = trunc i8 %{{.*}} to i1 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with constants +int test_constant_cond(void) { + // CHECK-LABEL: define {{.*}} @test_constant_cond + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 true, i32 42, i32 24) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(1, 42, 24); +} + +int test_zero_cond(void) { + // CHECK-LABEL: define {{.*}} @test_zero_cond + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 false, i32 42, i32 24) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(0, 42, 24); +} + +// Test type promotion +int test_promotion(int cond, short a, short b) { + // CHECK-LABEL: define {{.*}} @test_promotion + // CHECK-DAG: [[A_EXT:%.*]] = sext i16 %{{.*}} to i32 + // CHECK-DAG: [[B_EXT:%.*]] = sext i16 %{{.*}} to i32 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[A_EXT]], i32 [[B_EXT]]) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, (int)a, (int)b); +} + +// Test mixed signedness +unsigned int test_mixed_signedness(int cond, int a, unsigned int b) { + // CHECK-LABEL: define {{.*}} @test_mixed_signedness + // CHECK-DAG: [[A_EXT:%.*]] = sext i32 %{{.*}} to i64 + // CHECK-DAG: [[B_EXT:%.*]] = zext i32 %{{.*}} to i64 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 [[A_EXT]], i64 [[B_EXT]]) + // CHECK: [[RESULT_TRUNC:%.*]] = trunc i64 [[RESULT]] to i32 + // CHECK: ret i32 [[RESULT_TRUNC]] + return __builtin_ct_select(cond, (long)a, (long)b); +} + +// Test complex expression +int test_complex_expr_alt(int x, int y) { + // CHECK-LABEL: define {{.*}} @test_complex_expr_alt + // CHECK-DAG: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, 0 + // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}} + // Separate the final sequence to ensure proper ordering + // CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 [[ADD]], i32 [[SUB]]) + // CHECK-NEXT: ret i32 [[RESULT]] + return __builtin_ct_select(x > 0, x + y, x - y); +} + +// Test nested calls +int test_nested_structured(int cond1, int cond2, int a, int b, int c) { + // CHECK-LABEL: define {{.*}} @test_nested_structured + // Phase 1: Conditions (order doesn't matter) + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + + // Phase 2: Inner select (must happen before outer) + // CHECK: [[INNER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + + // Phase 3: Outer select (must use inner result) + // CHECK: [[RESULT:%.*]] = 
call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER]], i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c); +} + +// Test with function calls +int helper(int x) { return x * 2; } +int test_function_calls(int cond, int x, int y) { + // CHECK-LABEL: define {{.*}} @test_function_calls + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[CALL1:%.*]] = call i32 @helper(i32 noundef %{{.*}}) + // CHECK-DAG: [[CALL2:%.*]] = call i32 @helper(i32 noundef %{{.*}}) + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[CALL1]], i32 [[CALL2]]) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, helper(x), helper(y)); +} + +// Test using ct_select as condition for another ct_select +int test_intrinsic_condition(int cond1, int cond2, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_intrinsic_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[INNER_COND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[INNER_COND]], 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, cond2, a), b, c); +} + +// Test using comparison result of ct_select as condition +int test_comparison_condition(int cond, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_comparison_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[CMP:%.*]] = icmp sgt i32 [[FIRST_SELECT]], %{{.*}} + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond, a, b) > c, d, a); +} + +// Test using ct_select result in arithmetic as condition +int test_arithmetic_condition(int cond, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_arithmetic_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[ADD:%.*]] = add nsw i32 [[FIRST_SELECT]], %{{.*}} + // CHECK: [[FINAL_COND:%.*]] = icmp ne i32 [[ADD]], 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond, a, b) + c, d, a); +} + +// Test chained ct_select as conditions +int test_chained_conditions(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e) { + // CHECK-LABEL: define {{.*}} @test_chained_conditions + // CHECK: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[SECOND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int first_select = __builtin_ct_select(cond1, a, b); + int second_select = __builtin_ct_select(cond2, first_select, c); + return __builtin_ct_select(second_select, d, 
e); +} + +// Test using ct_select with pointer condition +//int test_pointer_condition(int *ptr1, int *ptr2, int a, int b, int c) { + // NO-CHECK-LABEL: define {{.*}} @test_pointer_condition + // NO-CHECK: [[PTR_COND:%.*]] = icmp ne ptr %{{.*}}, null + // NO-CHECK: [[PTR_SELECT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[PTR_COND]], ptr %{{.*}}, ptr %{{.*}}) + // NO-CHECK: [[FINAL_COND:%.*]] = icmp ne ptr [[PTR_SELECT]], null + // NO-CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // NO-CHECK: ret i32 [[RESULT]] +// return __builtin_ct_select(__builtin_ct_select(ptr1, ptr1, ptr2), a, b); +//} + + +// Test using ct_select result in logical operations as condition +int test_logical_condition(int cond1, int cond2, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_logical_condition + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[SELECT_BOOL:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, a, b) && cond2, c, d); +} + +// Test multiple levels of ct_select as conditions +int test_deep_condition_nesting(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e, int f) { + // CHECK-LABEL: define {{.*}} @test_deep_condition_nesting + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[INNER1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[INNER1_COND:%.*]] = icmp ne i32 [[INNER1]], 0 + // CHECK-DAG: [[INNER2:%.*]] = call i32 @llvm.ct.select.i32(i1 [[INNER1_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[OUTER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER2]], i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[OUTER]], 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, __builtin_ct_select(__builtin_ct_select(cond2, a, b), c, d), e), f, a); +} + +// Test ct_select with complex condition expressions +int test_complex_condition_expr(int x, int y, int z, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_complex_condition_expr + // CHECK: [[CMP1:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}} + // CHECK: [[SELECT1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[CMP2:%.*]] = icmp slt i32 [[SELECT1]], %{{.*}} + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(x > y, x, y) < z, a, b); +} + +// Test vector types - 128-bit vectors +typedef int __attribute__((vector_size(16))) int4; +typedef float __attribute__((vector_size(16))) float4; +typedef short __attribute__((vector_size(16))) short8; +typedef char __attribute__((vector_size(16))) char16; + +int4 test_vector_int4(int cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int4 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x 
i32> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +float4 test_vector_float4(int cond, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float4 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +short8 test_vector_short8(int cond, short8 a, short8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_short8 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <8 x i16> @llvm.ct.select.v8i16(i1 [[COND]], <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: ret <8 x i16> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +char16 test_vector_char16(int cond, char16 a, char16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_char16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x i8> @llvm.ct.select.v16i8(i1 [[COND]], <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: ret <16 x i8> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test 256-bit vectors +typedef int __attribute__((vector_size(32))) int8; +typedef float __attribute__((vector_size(32))) float8; +typedef double __attribute__((vector_size(32))) double4; + +int8 test_vector_int8(int cond, int8 a, int8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int8 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <8 x i32> @llvm.ct.select.v8i32(i1 [[COND]], <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +float8 test_vector_float8(int cond, float8 a, float8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float8 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <8 x float> @llvm.ct.select.v8f32(i1 [[COND]], <8 x float> %{{.*}}, <8 x float> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +double4 test_vector_double4(int cond, double4 a, double4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_double4 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <4 x double> @llvm.ct.select.v4f64(i1 [[COND]], <4 x double> %{{.*}}, <4 x double> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +// Test 512-bit vectors +typedef int __attribute__((vector_size(64))) int16; +typedef float __attribute__((vector_size(64))) float16; + +int16 test_vector_int16(int cond, int16 a, int16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x i32> @llvm.ct.select.v16i32(i1 [[COND]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +float16 test_vector_float16(int cond, float16 a, float16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x float> @llvm.ct.select.v16f32(i1 [[COND]], <16 x float> %{{.*}}, <16 x float> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +// Test vector operations with different condition types +int4 test_vector_char_cond(char cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_char_cond + // CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + return 
__builtin_ct_select(cond, a, b); +} + +float4 test_vector_long_cond(long cond, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_long_cond + // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test vector constants +int4 test_vector_constant_cond(void) { + // CHECK-LABEL: define {{.*}} @test_vector_constant_cond + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + int4 a = {1, 2, 3, 4}; + int4 b = {5, 6, 7, 8}; + return __builtin_ct_select(1, a, b); +} + +float4 test_vector_zero_cond(void) { + // CHECK-LABEL: define {{.*}} @test_vector_zero_cond + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 false, <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float4 a = {1.0f, 2.0f, 3.0f, 4.0f}; + float4 b = {5.0f, 6.0f, 7.0f, 8.0f}; + return __builtin_ct_select(0, a, b); +} + +// Test nested vector selections +int4 test_vector_nested(int cond1, int cond2, int4 a, int4 b, int4 c) { + // CHECK-LABEL: define {{.*}} @test_vector_nested + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[INNER:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND2]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND1]], <4 x i32> [[INNER]], <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c); +} + +// Test vector selection with complex expressions +float4 test_vector_complex_expr(int x, int y, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_complex_expr + // CHECK: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}} + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[CMP]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(x > y, a, b); +} + +// Test vector with different element sizes +typedef long long __attribute__((vector_size(16))) long2; +typedef double __attribute__((vector_size(16))) double2; + +long2 test_vector_long2(int cond, long2 a, long2 b) { + // CHECK-LABEL: define {{.*}} @test_vector_long2 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <2 x i64> @llvm.ct.select.v2i64(i1 [[COND]], <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: ret <2 x i64> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +double2 test_vector_double2(int cond, double2 a, double2 b) { + // CHECK-LABEL: define {{.*}} @test_vector_double2 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <2 x double> @llvm.ct.select.v2f64(i1 [[COND]], <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: ret <2 x double> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test mixed vector operations +int4 test_vector_from_scalar_condition(int4 vec_cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_from_scalar_condition + // Extract first element and use as condition + int scalar_cond = vec_cond[0]; + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // 
CHECK: ret <4 x i32> [[RESULT]] + return __builtin_ct_select(scalar_cond, a, b); +} + +// Test vector chaining +float4 test_vector_chaining(int cond1, int cond2, int cond3, float4 a, float4 b, float4 c, float4 d) { + // CHECK-LABEL: define {{.*}} @test_vector_chaining + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND3:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND1]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK-DAG: [[SECOND:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND2]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK-DAG: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND3]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float4 first = __builtin_ct_select(cond1, a, b); + float4 second = __builtin_ct_select(cond2, first, c); + return __builtin_ct_select(cond3, second, d); +} + +// Test special floating point values - NaN +float test_nan_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_nan_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float 1.000000e+00) + // CHECK: ret float [[RESULT]] + float nan_val = __builtin_nanf(""); + return __builtin_ct_select(cond, nan_val, 1.0f); +} + +double test_nan_double_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_nan_double_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double 2.000000e+00) + // CHECK: ret double [[RESULT]] + double nan_val = __builtin_nan(""); + return __builtin_ct_select(cond, nan_val, 2.0); +} + +// Test infinity values +float test_infinity_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_infinity_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + float pos_inf = __builtin_inff(); + float neg_inf = -__builtin_inff(); + return __builtin_ct_select(cond, pos_inf, neg_inf); +} + +double test_infinity_double_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_infinity_double_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + double pos_inf = __builtin_inf(); + double neg_inf = -__builtin_inf(); + return __builtin_ct_select(cond, pos_inf, neg_inf); +} + +// Test subnormal/denormal values +float test_subnormal_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_subnormal_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + // Very small subnormal values + float subnormal1 = 1e-40f; + float subnormal2 = 1e-45f; + return __builtin_ct_select(cond, subnormal1, subnormal2); +} + +// Test integer overflow boundaries +int test_integer_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_integer_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + 
int max_int = __INT_MAX__; + int min_int = (-__INT_MAX__ - 1); + return __builtin_ct_select(cond, max_int, min_int); +} + +long long test_longlong_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_longlong_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + long long max_ll = __LONG_LONG_MAX__; + long long min_ll = (-__LONG_LONG_MAX__ - 1); + return __builtin_ct_select(cond, max_ll, min_ll); +} + +// Test unsigned overflow boundaries +unsigned int test_unsigned_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_unsigned_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + unsigned int max_uint = 4294967295; + unsigned int min_uint = 0; + return __builtin_ct_select(cond, max_uint, min_uint); +} + +// Test null pointer dereference avoidance +int* test_null_pointer_operands(int cond, int* valid_ptr) { + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}}) + // CHECK: ret ptr [[RESULT]] + int* null_ptr = (int*)0; + return __builtin_ct_select(cond, null_ptr, valid_ptr); +} + +// Test volatile operations +volatile int global_volatile = 42; +int test_volatile_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_volatile_operands + // CHECK-DAG: [[VOLATILE_LOAD:%.*]] = load volatile i32, ptr {{.*}} + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 100) + // CHECK: ret i32 [[RESULT]] + volatile int vol_val = global_volatile; + return __builtin_ct_select(cond, vol_val, 100); +} + +// Test uninitialized variable behavior (should still work with ct_select) +int test_uninitialized_operands(int cond, int initialized) { + // CHECK-LABEL: define {{.*}} @test_uninitialized_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int uninitialized; // Intentionally uninitialized + return __builtin_ct_select(cond, uninitialized, initialized); +} + +// Test zero division avoidance patterns +int test_division_by_zero_avoidance(int cond, int dividend, int divisor) { + // CHECK-LABEL: define {{.*}} @test_division_by_zero_avoidance + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[DIV_RESULT:%.*]] = sdiv i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SAFE_DIVISOR:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 1) + // First get a safe divisor (never zero) + int safe_divisor = __builtin_ct_select(divisor != 0, divisor, 1); + // Then perform division with guaranteed non-zero divisor + return dividend / safe_divisor; +} + +// Test array bounds checking patterns +int test_array_bounds_protection(int cond, int index, int* array) { + // CHECK-LABEL: define {{.*}} @test_array_bounds_protection + // CHECK-DAG: [[SAFE_INDEX:%.*]] = call i32 @llvm.ct.select.i32(i1 {{.*}}, i32 %{{.*}}, i32 0) + // Use ct_select to ensure safe array indexing + int safe_index = __builtin_ct_select(index >= 0 && index < 10, index, 0); + return array[safe_index]; +} + +// Test bit manipulation edge cases +unsigned int test_bit_manipulation_edge_cases(int 
cond, unsigned int value) { + // CHECK-LABEL: define {{.*}} @test_bit_manipulation_edge_cases + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[SHIFT_LEFT:%.*]] = shl i32 %{{.*}}, 31 + // CHECK-DAG: [[SHIFT_RIGHT:%.*]] = lshr i32 %{{.*}}, 31 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + // Test extreme bit shifts that could cause undefined behavior + unsigned int left_shift = value << 31; // Could overflow + unsigned int right_shift = value >> 31; // Extract sign bit + return __builtin_ct_select(cond, left_shift, right_shift); +} + +// Test signed integer wraparound +int test_signed_wraparound(int cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_signed_wraparound + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int sum = a + b; // Could overflow + int diff = a - b; // Could underflow + return __builtin_ct_select(cond, sum, diff); +} + +// Test vector NaN handling +float4 test_vector_nan_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_vector_nan_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float nan_val = __builtin_nanf(""); + float4 nan_vec = {nan_val, nan_val, nan_val, nan_val}; + float4 normal_vec = {1.0f, 2.0f, 3.0f, 4.0f}; + return __builtin_ct_select(cond, nan_vec, normal_vec); +} + +// Test vector infinity handling +float4 test_vector_infinity_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_vector_infinity_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float pos_inf = __builtin_inff(); + float neg_inf = -__builtin_inff(); + float4 inf_vec = {pos_inf, neg_inf, pos_inf, neg_inf}; + float4 zero_vec = {0.0f, 0.0f, 0.0f, 0.0f}; + return __builtin_ct_select(cond, inf_vec, zero_vec); +} + +// Test mixed special values +double test_mixed_special_values(int cond) { + // CHECK-LABEL: define {{.*}} @test_mixed_special_values + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + double nan_val = __builtin_nan(""); + double inf_val = __builtin_inf(); + return __builtin_ct_select(cond, nan_val, inf_val); +} + +// Test constant-time memory access pattern +int test_constant_time_memory_access(int secret_index, int* data_array) { + // CHECK-LABEL: define {{.*}} @test_constant_time_memory_access + // This pattern ensures constant-time memory access regardless of secret_index value + int result = 0; + // Use ct_select to accumulate values without revealing the secret index + for (int i = 0; i < 8; i++) { + int is_target = (i == secret_index); + int current_value = data_array[i]; + int selected_value = __builtin_ct_select(is_target, current_value, 0); + result += selected_value; + } + return result; +} + +// Test timing-attack resistant comparison +int test_timing_resistant_comparison(const char* secret, const char* guess) { + 
// CHECK-LABEL: define {{.*}} @test_timing_resistant_comparison + // Constant-time string comparison using ct_select + int match = 1; + for (int i = 0; i < 32; i++) { + int chars_equal = (secret[i] == guess[i]); + int both_null = (secret[i] == 0) && (guess[i] == 0); + int still_matching = __builtin_ct_select(chars_equal || both_null, match, 0); + match = __builtin_ct_select(both_null, match, still_matching); + } + return match; +} diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index ff3dd0d4c3c51..656f6e718f029 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -783,6 +783,10 @@ enum NodeType { /// i1 then the high bits must conform to getBooleanContents. SELECT, + /// Constant-time Select, implemented with CMOV instruction. This is used to + /// implement constant-time select. + CTSELECT, + /// Select with a vector condition (op #0) and two vector operands (ops #1 /// and #2), returning a vector result. All vectors have the same length. /// Much like the scalar select and setcc, each bit in the condition selects diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index df6ce0fe1b037..00d2f5bd6c8eb 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1348,6 +1348,13 @@ class SelectionDAG { return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags); } + SDValue getCTSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, + SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) { + assert(LHS.getValueType() == VT && RHS.getValueType() == VT && + "Cannot use select on differing types"); + return getNode(ISD::CTSELECT, DL, VT, Cond, LHS, RHS, Flags); + } + /// Helper function to make it easier to build SelectCC's if you just have an /// ISD::CondCode instead of an SDValue. SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 69713d0d84011..55c62ff7e7216 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -426,6 +426,10 @@ struct SDNodeFlags { NonNeg | NoNaNs | NoInfs | SameSign, FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal | AllowContract | ApproximateFuncs | AllowReassociation, + + // Instructs DAGCombiner to skip optimization passes for this node. + // Preserves the operation as-is without folding, merging, or elimination. + NoMerge = 1 << 15, }; /// Default constructor turns off all optimization flags. @@ -458,6 +462,7 @@ struct SDNodeFlags { void setAllowReassociation(bool b) { setFlag(b); } void setNoFPExcept(bool b) { setFlag(b); } void setUnpredictable(bool b) { setFlag(b); } + void setNoMerge(bool b) { setFlag(b); } // These are accessors for each flag. 
bool hasNoUnsignedWrap() const { return Flags & NoUnsignedWrap; } @@ -475,6 +480,7 @@ struct SDNodeFlags { bool hasAllowReassociation() const { return Flags & AllowReassociation; } bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } + bool hasNoMerge() const { return Flags & NoMerge; } bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 73f2c55a71125..375a4bf4c5c03 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -242,11 +242,15 @@ class LLVM_ABI TargetLoweringBase { /// Enum that describes what type of support for selects the target has. enum SelectSupportKind { - ScalarValSelect, // The target supports scalar selects (ex: cmov). - ScalarCondVectorVal, // The target supports selects with a scalar condition - // and vector values (ex: cmov). - VectorMaskSelect // The target supports vector selects with a vector - // mask (ex: x86 blends). + ScalarValSelect, // The target supports scalar selects (ex: cmov). + ScalarCondVectorVal, // The target supports selects with a scalar condition + // and vector values (ex: cmov). + VectorMaskSelect, // The target supports vector selects with a vector + // mask (ex: x86 blends). + CtSelect, // The target implements a custom constant-time select. + ScalarCondVectorValCtSelect, // The target supports selects with a scalar + // condition and vector values. + VectorMaskValCtSelect, // The target supports vector selects with a vector }; /// Enum that specifies what an atomic load/AtomicRMWInst is expanded @@ -476,8 +480,8 @@ class LLVM_ABI TargetLoweringBase { MachineMemOperand::Flags getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const; - virtual bool isSelectSupported(SelectSupportKind /*kind*/) const { - return true; + virtual bool isSelectSupported(SelectSupportKind kind) const { + return kind != CtSelect; } /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 8856eda250ed6..32f8fce3f05d9 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1813,6 +1813,13 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic< [IntrReadMem, IntrArgMemOnly, ReadOnly>, NoCapture>]>; +///===-------------------------- Constant Time Intrinsics --------------------------===// +// +// Intrinsic to support constant time select +def int_ct_select : DefaultAttrsIntrinsic<[llvm_any_ty], + [llvm_i1_ty, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrWillReturn, IntrNoDuplicate, NoUndef]>; + ///===-------------------------- Other Intrinsics --------------------------===// // // TODO: We should introduce a new memory kind fo traps (and other side effects diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..c783a2aa9258f 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -214,6 +214,10 @@ def SDTSelect : SDTypeProfile<1, 3, [ // select SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3> ]>; +def SDTCtSelect : SDTypeProfile<1, 3, [ // ctselect + SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3> +]>; + def SDTVSelect : SDTypeProfile<1, 3, [ // vselect SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 
3>, SDTCisSameNumEltsAs<0, 1> ]>; @@ -717,6 +721,7 @@ def reset_fpmode : SDNode<"ISD::RESET_FPMODE", SDTNone, [SDNPHasChain]>; def setcc : SDNode<"ISD::SETCC" , SDTSetCC>; def select : SDNode<"ISD::SELECT" , SDTSelect>; +def ctselect : SDNode<"ISD::CTSELECT" , SDTCtSelect>; def vselect : SDNode<"ISD::VSELECT" , SDTVSelect>; def selectcc : SDNode<"ISD::SELECT_CC" , SDTSelectCC>; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c97300d64d455..06167fb7c79d6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -484,6 +484,7 @@ namespace { SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); SDValue visitCTPOP(SDNode *N); SDValue visitSELECT(SDNode *N); + SDValue visitCTSELECT(SDNode *N); SDValue visitVSELECT(SDNode *N); SDValue visitVP_SELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); @@ -1898,6 +1899,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { } SDValue DAGCombiner::visit(SDNode *N) { + if (N->getFlags().hasNoMerge()) + return SDValue(); + // clang-format off switch (N->getOpcode()) { default: break; @@ -1968,6 +1972,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); case ISD::CTPOP: return visitCTPOP(N); case ISD::SELECT: return visitSELECT(N); + case ISD::CTSELECT: return visitCTSELECT(N); case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); @@ -6016,6 +6021,7 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, N0CC = cast(N0.getOperand(4))->get(); break; case ISD::SELECT: + case ISD::CTSELECT: case ISD::VSELECT: if (N0.getOperand(0).getOpcode() != ISD::SETCC) return SDValue(); @@ -12168,8 +12174,9 @@ template static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT || - N->getOpcode() == ISD::VP_SELECT) && - "Expected a (v)(vp.)select"); + N->getOpcode() == ISD::VP_SELECT || + N->getOpcode() == ISD::CTSELECT) && + "Expected a (v)(vp.)(ct) select"); SDValue Cond = N->getOperand(0); SDValue T = N->getOperand(1), F = N->getOperand(2); EVT VT = N->getValueType(0); @@ -12531,6 +12538,109 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitCTSELECT(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + EVT VT0 = N0.getValueType(); + SDLoc DL(N); + SDNodeFlags Flags = N->getFlags(); + + if (SDValue V = foldBoolSelectToLogic(N, DL, DAG)) + return V; + + // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1 + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { + SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1); + SelectOp->setFlags(Flags); + return SelectOp; + } + + if (VT0 == MVT::i1) { + // The code in this block deals with the following 2 equivalences: + // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) + // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) + // The target can specify its preferred form with the + // shouldNormalizeToSelectSequence() callback. However we always transform + // to the right anyway if we find the inner select exists in the DAG anyway + // and we always transform to the left side if we know that we can further + // optimize the combination of the conditions. 
+ bool normalizeToSequence = + TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); + // ctselect (and Cond0, Cond1), X, Y + // -> ctselect Cond0, (ctselect Cond1, X, Y), Y + if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); + if (normalizeToSequence || !InnerSelect.use_empty()) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, + InnerSelect, N2, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); + } + // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1, + // X, Y) + if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); + if (normalizeToSequence || !InnerSelect.use_empty()) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1, + InnerSelect, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); + } + + // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, Cond1), + // X, Y + if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) { + SDValue N1_0 = N1->getOperand(0); + SDValue N1_1 = N1->getOperand(1); + SDValue N1_2 = N1->getOperand(2); + if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { + // Create the actual and node if we can generate good code for it. + if (!normalizeToSequence) { + SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, + N2, Flags); + } + // Otherwise see if we can optimize the "and" to a better pattern. + if (SDValue Combined = visitANDLike(N0, N1_0, N)) { + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, + N1_1, N2, Flags); + } + } + } + // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1), + // X, Y + if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) { + SDValue N2_0 = N2->getOperand(0); + SDValue N2_1 = N2->getOperand(1); + SDValue N2_2 = N2->getOperand(2); + if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { + // Create the actual or node if we can generate good code for it. + if (!normalizeToSequence) { + SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2, + Flags); + } + // Otherwise see if we can optimize to a better pattern. + if (SDValue Combined = visitORLike(N0, N2_0, DL)) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, N1, + N2_2, Flags); + } + } + } + + return SDValue(); +} + // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). 
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5fb7e63cfb605..54d51aaa15442 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4135,6 +4135,40 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } Results.push_back(Tmp1); break; + case ISD::CTSELECT: { + Tmp1 = Node->getOperand(0); + Tmp2 = Node->getOperand(1); + Tmp3 = Node->getOperand(2); + EVT VT = Tmp2.getValueType(); + if (VT.isVector()) { + SmallVector Elements; + unsigned NumElements = VT.getVectorNumElements(); + EVT ScalarVT = VT.getScalarType(); + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64); + SDValue TVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal); + SDValue FVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal); + Elements.push_back(DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags())); + } + Tmp1 = DAG.getBuildVector(VT, dl, Elements); + } else if (VT.isFloatingPoint()) { + EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + Tmp2 = DAG.getBitcast(IntegerVT, Tmp2); + Tmp3 = DAG.getBitcast(IntegerVT, Tmp3); + Tmp1 = DAG.getBitcast(VT, DAG.getCTSelect(dl, IntegerVT, Tmp1, Tmp2, Tmp3, Node->getFlags())); + } else { + assert(VT.isInteger()); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT); + auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT); + SDValue ResLo = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags()); + SDValue ResHi = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags()); + Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi); + Tmp1->setFlags(Node->getFlags()); + } + Results.push_back(Tmp1); + break; + } case ISD::BR_JT: { SDValue Chain = Node->getOperand(0); SDValue Table = Node->getOperand(1); @@ -5473,7 +5507,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2)); break; } - case ISD::SELECT: { + case ISD::SELECT: + case ISD::CTSELECT: { unsigned ExtOp, TruncOp; if (Node->getValueType(0).isVector() || Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) { @@ -5491,7 +5526,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2)); // Perform the larger operation, then round down. 
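+    // Re-emit with the node's own opcode so a promoted CTSELECT remains a
+    // CTSELECT (and keeps its flags) instead of decaying to a plain SELECT.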
- Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3); + Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3); + Tmp1->setFlags(Node->getFlags()); if (TruncOp != ISD::FP_ROUND) Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1); else diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 437d0f4654096..61251e58046d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -159,6 +159,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::ATOMIC_LOAD: R = SoftenFloatRes_ATOMIC_LOAD(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; + case ISD::CTSELECT: R = SoftenFloatRes_CTSELECT(N); break; case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; case ISD::FREEZE: R = SoftenFloatRes_FREEZE(N); break; case ISD::STRICT_SINT_TO_FP: @@ -1041,6 +1042,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { LHS.getValueType(), N->getOperand(0), LHS, RHS); } +SDValue DAGTypeLegalizer::SoftenFloatRes_CTSELECT(SDNode *N) { + SDValue LHS = GetSoftenedFloat(N->getOperand(1)); + SDValue RHS = GetSoftenedFloat(N->getOperand(2)); + return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, + RHS); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); @@ -1541,6 +1549,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; @@ -2897,6 +2906,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { R = PromoteFloatRes_ATOMIC_LOAD(N); break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; + case ISD::CTSELECT: + R = PromoteFloatRes_SELECT(N); + break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; case ISD::SINT_TO_FP: @@ -3199,7 +3211,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) { SDValue TrueVal = GetPromotedFloat(N->getOperand(1)); SDValue FalseVal = GetPromotedFloat(N->getOperand(2)); - return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0), + return DAG.getNode(N->getOpcode(), SDLoc(N), TrueVal->getValueType(0), N->getOperand(0), TrueVal, FalseVal); } @@ -3383,6 +3395,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { R = SoftPromoteHalfRes_ATOMIC_LOAD(N); break; case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break; + case ISD::CTSELECT: + R = SoftPromoteHalfRes_SELECT(N); + break; case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break; case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 88a4a8b16373b..124f61df9679b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -95,6 +95,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_COMPRESS(N); break; case ISD::SELECT: + case ISD::CTSELECT: case 
ISD::VSELECT: case ISD::VP_SELECT: case ISD::VP_MERGE: @@ -2000,6 +2001,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; + case ISD::CTSELECT: + Res = PromoteIntOp_CTSELECT(N, OpNo); + break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; case ISD::VP_SETCC: case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; @@ -2377,6 +2381,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { N->getOperand(2)), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo) { + assert(OpNo == 0 && "Only know how to promote the condition!"); + SDValue Cond = N->getOperand(0); + EVT OpTy = N->getOperand(1).getValueType(); + + // Promote all the way up to the canonical SetCC type. + EVT OpVT = N->getOpcode() == ISD::CTSELECT ? OpTy.getScalarType() : OpTy; + Cond = PromoteTargetBoolean(Cond, OpVT); + + return SDValue( + DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); @@ -2978,6 +2995,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: + SplitRes_Select(N, Lo, Hi); + break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 603dc34ce72a7..f76520ad07508 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -401,6 +401,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_ScalarOp(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); @@ -633,6 +634,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_LOAD(SDNode *N); SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N); SDValue SoftenFloatRes_SELECT(SDNode *N); + SDValue SoftenFloatRes_CTSELECT(SDNode *N); SDValue SoftenFloatRes_SELECT_CC(SDNode *N); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); @@ -893,6 +895,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); + SDValue ScalarizeVecRes_CTSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); SDValue ScalarizeVecRes_SETCC(SDNode *N); SDValue ScalarizeVecRes_UNDEF(SDNode *N); @@ -1221,7 +1224,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue &Lo, SDValue &Hi); void SplitVecRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi); void 
SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 88c1af20a321e..098368ef2f6b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -570,6 +570,20 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) { Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi); } +void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { + SDValue LL, LH, RL, RH, CL, CH; + SDLoc dl(N); + GetSplitOp(N->getOperand(1), LL, LH); + GetSplitOp(N->getOperand(2), RL, RH); + + SDValue Cond = N->getOperand(0); + CL = CH = Cond; + assert(!Cond.getValueType().isVector() && "Unsupported vector type"); + + Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL); + Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH); +} + void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f7c089a..4ecc12c1f0e31 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -74,6 +74,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break; case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; + case ISD::CTSELECT: + R = ScalarizeVecRes_CTSELECT(N); + break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::POISON: @@ -655,6 +658,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) { GetScalarizedVector(N->getOperand(2))); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_CTSELECT(SDNode *N) { + SDValue LHS = GetScalarizedVector(N->getOperand(1)); + return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, + GetScalarizedVector(N->getOperand(2))); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), @@ -1189,6 +1198,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SELECT: case ISD::VP_MERGE: case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: + SplitRes_CTSELECT(N, Lo, Hi); + break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; @@ -4854,6 +4866,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::VSELECT: case ISD::SELECT: + case ISD::CTSELECT: case ISD::VP_SELECT: case ISD::VP_MERGE: Res = WidenVecRes_Select(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3ef5471..8e14aa1f869a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8249,6 +8249,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, 
return V; break; } + case ISD::SELECT: case ISD::VSELECT: if (SDValue V = simplifySelect(N1, N2, N3)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb0038c54f8c7..90d9ac76b6e57 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6489,6 +6489,172 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, setValue(&I, Result); } +/// Fallback implementation is an alternative approach for managing +/// architectures that don't have native support for Constant-Time Select. This +/// function uses DAG Chaining +SDValue SelectionDAGBuilder::createProtectedCtSelectFallbackChain( + SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, + EVT VT) { + + SDValue WorkingT = T; + SDValue WorkingF = F; + EVT WorkingVT = VT; + + SDValue Chain = DAG.getEntryNode(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + + if (VT.isVector() && !Cond.getValueType().isVector()) { + ElementCount NumElems = VT.getVectorElementCount(); + EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems); + + if (VT.isScalableVector()) { + Cond = DAG.getSplatVector(CondVT, DL, Cond); + } else { + Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + } + } + + if (VT.isFloatingPoint()) { + if (VT.isVector()) { + // float vector -> int vector + EVT ElemVT = VT.getVectorElementType(); + unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits(); + EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth); + + WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT, + VT.getVectorElementCount()); + } else { + WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + } + + WorkingT = DAG.getBitcast(WorkingVT, T); + WorkingF = DAG.getBitcast(WorkingVT, F); + } + + SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT); + + SDValue AllOnes; + if (WorkingVT.isScalableVector()) { + unsigned BitWidth = WorkingVT.getScalarSizeInBits(); + APInt AllOnesVal = APInt::getAllOnes(BitWidth); + SDValue ScalarAllOnes = + DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType()); + AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes); + } else { + AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); + } + + SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes); + + // (or (and WorkingT, Mask), (and F, ~Mask)) + SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT); + + bool CanUseChaining = false; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (CanUseChaining) { + // Apply chaining through registers for additional protection + + const TargetRegisterClass *RC = TLI.getRegClassFor(WorkingVT.getSimpleVT()); + Register TMReg = MRI.createVirtualRegister(RC); + Chain = DAG.getCopyToReg(Chain, DL, TMReg, TM); + TM = DAG.getCopyFromReg(Chain, DL, TMReg, WorkingVT); + } + + SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF); + + if (!WorkingVT.isScalableVector()) { + // For fixed-size vectors and scalars, we can safely use register classes + CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT()); + } else { + // For scalable vectors, check if the target has register class support + // This is target-specific - RISC-V might not support this directly + CanUseChaining = false; // Conservative: disable for scalable vectors + } + + + SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM); + + // Convert back if needed + if (WorkingVT != VT) { + 
Result = DAG.getBitcast(VT, Result); + } + + return Result; +} + +/// Fallback implementation is an alternative approach for managing +/// architectures that don't have native support for Constant-Time Select. This +/// function uses the NoMerge flag +SDValue SelectionDAGBuilder::createProtectedCtSelectFallbackNoMerge( + SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, + EVT VT) { + SDNodeFlags ProtectedFlag; + ProtectedFlag.setNoMerge(true); + + SDValue WorkingT = T; + SDValue WorkingF = F; + EVT WorkingVT = VT; + + if (VT.isVector() && !Cond.getValueType().isVector()) { + ElementCount NumElems = VT.getVectorElementCount(); + EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems); + + if (VT.isScalableVector()) { + Cond = DAG.getSplatVector(CondVT, DL, Cond); + } else { + Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + } + } + + if (VT.isFloatingPoint()) { + if (VT.isVector()) { + // float vector -> int vector + EVT ElemVT = VT.getVectorElementType(); + unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits(); + EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth); + + WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT, + VT.getVectorElementCount()); + } else { + WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + } + + WorkingT = DAG.getBitcast(WorkingVT, T); + WorkingF = DAG.getBitcast(WorkingVT, F); + } + + SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT); + + SDValue AllOnes; + if (WorkingVT.isScalableVector()) { + unsigned BitWidth = WorkingVT.getScalarSizeInBits(); + APInt AllOnesVal = APInt::getAllOnes(BitWidth); + SDValue ScalarAllOnes = + DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType()); + AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes); + } else { + AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); + } + + SDValue Invert = + DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag); + + // (or (and WorkingT, Mask), (and F, ~Mask)) + SDValue TM = + DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT, ProtectedFlag); + SDValue FM = + DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag); + SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag); + + // Convert back if needed + if (WorkingVT != VT) { + Result = DAG.getBitcast(VT, Result); + } + + return Result; +} + /// Lower the call to the specified intrinsic function. void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { @@ -6667,6 +6833,53 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, updateDAGForMaybeTailCall(MC); return; } + case Intrinsic::ct_select: { + SDLoc DL = getCurSDLoc(); + + SDValue Cond = getValue(I.getArgOperand(0)); // i1 + SDValue A = getValue(I.getArgOperand(1)); // T + SDValue B = getValue(I.getArgOperand(2)); // T + + assert((A.getValueType() == B.getValueType()) && + "Operands are of different types"); + + EVT VT = A.getValueType(); + EVT CondVT = Cond.getValueType(); + + // For now we'll only support scalar predicates + // assert if Cond type is Vector + // TODO: Maybe look into supporting vector predicates? 
+ if (CondVT.isVector()) { + report_fatal_error( + "llvm.ct.select: predicates with vector types not supported yet"); + } + + // Set function attribute to indicate ct.select usage + Function &F = DAG.getMachineFunction().getFunction(); + F.addFnAttr("ct-select"); + + // Handle scalar types + if (TLI.isSelectSupported( + TargetLoweringBase::SelectSupportKind::CtSelect) && + !CondVT.isVector()) { + SDValue Result = DAG.getNode(ISD::CTSELECT, DL, VT, Cond, A, B); + setValue(&I, Result); + return; + } + + // We don't support non-integral pointers + Type *CurrType = VT.getTypeForEVT(*Context); + if (CurrType->isPointerTy()) { + unsigned AS = CurrType->getPointerAddressSpace(); + if (DAG.getDataLayout().isNonIntegralAddressSpace(AS)) { + report_fatal_error( + "llvm.ct.select: non-integral pointers are not supported"); + } + } + + setValue(&I, createProtectedCtSelectFallbackChain(DAG, DL, Cond, A, B, VT)); + return; + } case Intrinsic::call_preallocated_setup: { const CallBase *PreallocatedCall = FindPreallocatedCall(&I); SDValue SrcValue = DAG.getSrcValue(PreallocatedCall); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c7577fa335feb..6068818a32656 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -214,6 +214,12 @@ class SelectionDAGBuilder { peelDominantCaseCluster(const SwitchInst &SI, SwitchCG::CaseClusterVector &Clusters, BranchProbability &PeeledCaseProb); + SDValue createProtectedCtSelectFallbackChain(SelectionDAG &DAG, + const SDLoc &DL, SDValue Cond, + SDValue T, SDValue F, EVT VT); + SDValue createProtectedCtSelectFallbackNoMerge(SelectionDAG &DAG, + const SDLoc &DL, SDValue Cond, + SDValue T, SDValue F, EVT VT); private: const TargetMachine &TM; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad6d0be1..274a1cd4f7594 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -332,6 +332,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::STRICT_FPOWI: return "strict_fpowi"; case ISD::SETCC: return "setcc"; + case ISD::CTSELECT: return "ctselect"; case ISD::SETCCCARRY: return "setcccarry"; case ISD::STRICT_FSETCC: return "strict_fsetcc"; case ISD::STRICT_FSETCCS: return "strict_fsetccs"; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b7a60a8..89e949d96146e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -511,12 +511,35 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::i8, Promote); + setOperationAction(ISD::CTSELECT, MVT::i16, Promote); + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::i64, Custom); if (Subtarget->hasFPARMv8()) { setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::bf16, Custom); } + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, MVT::f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::bf16, Custom); + } else { + setOperationAction(ISD::CTSELECT, 
MVT::f16, Promote); + setOperationAction(ISD::CTSELECT, MVT::bf16, Promote); + } setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::f64, Custom); + for (MVT VT : MVT::vector_valuetypes()) { + MVT elemType = VT.getVectorElementType(); + if (elemType == MVT::i8 || elemType == MVT::i16) { + setOperationAction(ISD::CTSELECT, VT, Promote); + } else if ((elemType == MVT::f16 || elemType == MVT::bf16) && !Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, VT, Promote); + } else { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); @@ -3328,6 +3351,18 @@ void AArch64TargetLowering::fixupPtrauthDiscriminator( IntDiscOp.setImm(IntDisc); } +MachineBasicBlock *AArch64TargetLowering::EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder Builder = BuildMI(*MBB, MI, DL, TII->get(Opcode)); + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + Builder.add(MI.getOperand(Idx)); + } + Builder->setFlag(MachineInstr::NoMerge); + MBB->remove_instr(&MI); + return MBB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -7590,6 +7625,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::CTSELECT: + return LowerCTSELECT(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::BR_JT: @@ -12146,6 +12183,22 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, return Res; } +SDValue AArch64TargetLowering::LowerCTSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CCVal = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + EVT VT = Op.getValueType(); + + SDValue Zero = DAG.getConstant(0, DL, CCVal.getValueType()); + SDValue CC; + SDValue Cmp = getAArch64Cmp(CCVal, Zero, ISD::SETNE, CC, DAG, DL); + + return DAG.getNode(AArch64ISD::CTSELECT, DL, VT, TVal, FVal, CC, Cmp); +} + SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. 
No additional tweaking diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9495c9ffc47aa..415360ea57adf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -23,6 +23,11 @@ namespace llvm { +namespace AArch64ISD { +// Forward declare the enum from the generated file +enum GenNodeType : unsigned; +} // namespace AArch64ISD + class AArch64TargetMachine; namespace AArch64 { @@ -202,6 +207,8 @@ class AArch64TargetLowering : public TargetLowering { MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const; + MachineBasicBlock *EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -685,6 +692,7 @@ class AArch64TargetLowering : public TargetLowering { iterator_range Users, SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; @@ -920,6 +928,10 @@ class AArch64TargetLowering : public TargetLowering { bool hasMultipleConditionRegisters(EVT VT) const override { return VT.isScalableVector(); } + + bool isSelectSupported(SelectSupportKind Kind) const override { + return true; + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 12c600f0f2661..7b3fbc64ada36 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2113,16 +2113,46 @@ bool AArch64InstrInfo::removeCmpToZeroOrOne( return true; } -bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && - MI.getOpcode() != AArch64::CATCHRET) - return false; +static inline void expandCtSelect(MachineBasicBlock &MBB, MachineInstr &MI, DebugLoc &DL, const MCInstrDesc &MCID) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, MCID); + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + Builder.add(MI.getOperand(Idx)); + } + Builder->setFlag(MachineInstr::NoMerge); + MBB.remove_instr(&MI); +} +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); auto &Subtarget = MBB.getParent()->getSubtarget(); auto TRI = Subtarget.getRegisterInfo(); DebugLoc DL = MI.getDebugLoc(); + switch (MI.getOpcode()) { + case AArch64::I32CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::CSELWr)); + return true; + case AArch64::I64CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::CSELXr)); + return true; + case AArch64::BF16CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr)); + return true; + case AArch64::F16CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr)); + return true; + case AArch64::F32CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELSrrr)); + return true; + case AArch64::F64CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELDrrr)); + return true; + } + + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && + MI.getOpcode() != AArch64::CATCHRET) + return false; + if (MI.getOpcode() == AArch64::CATCHRET) { // Skip to the first instruction before the epilog. 
const TargetInstrInfo *TII = diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f788c7510f80c..64de1674b494d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -464,6 +464,11 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; +def SDT_AArch64CtSelect : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -831,6 +836,7 @@ def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, [SDNPHasChain]>; +def AArch64ctselect : SDNode<"AArch64ISD::CTSELECT", SDT_AArch64CtSelect>; def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; // Conditional select invert. @@ -5683,6 +5689,45 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), let hasNoSchedulingInfo = 1; } +//===----------------------------------------------------------------------===// +// Constant-time conditional selection instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, isPseudo = 1, hasNoSchedulingInfo = 1, Uses = [NZCV] in { + def I32CTSELECT : Pseudo<(outs GPR32:$dst), + (ins GPR32:$tval, GPR32:$fval, i32imm:$cc), + [(set (i32 GPR32:$dst), + (AArch64ctselect GPR32:$tval, GPR32:$fval, + (i32 imm:$cc), NZCV))]>; + def I64CTSELECT : Pseudo<(outs GPR64:$dst), + (ins GPR64:$tval, GPR64:$fval, i32imm:$cc), + [(set (i64 GPR64:$dst), + (AArch64ctselect GPR64:$tval, GPR64:$fval, + (i32 imm:$cc), NZCV))]>; + let Predicates = [HasFullFP16] in { + def F16CTSELECT : Pseudo<(outs FPR16:$dst), + (ins FPR16:$tval, FPR16:$fval, i32imm:$cc), + [(set (f16 FPR16:$dst), + (AArch64ctselect (f16 FPR16:$tval), (f16 FPR16:$fval), + (i32 imm:$cc), NZCV))]>; + def BF16CTSELECT : Pseudo<(outs FPR16:$dst), + (ins FPR16:$tval, FPR16:$fval, i32imm:$cc), + [(set (bf16 FPR16:$dst), + (AArch64ctselect (bf16 FPR16:$tval), (bf16 FPR16:$fval), + (i32 imm:$cc), NZCV))]>; + } + def F32CTSELECT : Pseudo<(outs FPR32:$dst), + (ins FPR32:$tval, FPR32:$fval, i32imm:$cc), + [(set (f32 FPR32:$dst), + (AArch64ctselect FPR32:$tval, FPR32:$fval, + (i32 imm:$cc), NZCV))]>; + def F64CTSELECT : Pseudo<(outs FPR64:$dst), + (ins FPR64:$tval, FPR64:$fval, i32imm:$cc), + [(set (f64 FPR64:$dst), + (AArch64ctselect FPR64:$tval, FPR64:$fval, + (i32 imm:$cc), NZCV))]>; +} + //===----------------------------------------------------------------------===// // Instructions used for emitting unwind opcodes on ARM64 Windows. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 39946633603f6..e2ec9118eb5ee 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -393,5 +393,23 @@ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(AArch64::RET); OutMI.addOperand(MCOperand::createReg(AArch64::LR)); break; + case AArch64::I32CTSELECT: + OutMI.setOpcode(AArch64::CSELWr); + break; + case AArch64::I64CTSELECT: + OutMI.setOpcode(AArch64::CSELXr); + break; + case AArch64::BF16CTSELECT: + OutMI.setOpcode(AArch64::FCSELHrrr); + break; + case AArch64::F16CTSELECT: + OutMI.setOpcode(AArch64::FCSELHrrr); + break; + case AArch64::F32CTSELECT: + OutMI.setOpcode(AArch64::FCSELSrrr); + break; + case AArch64::F64CTSELECT: + OutMI.setOpcode(AArch64::FCSELDrrr); + break; } } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..fa10c00526cf7 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1526,18 +1526,340 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { BB->erase(MI); } +// Expands the ctselect pseudo for vector operands, post-RA. +bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + + // These operations will differ by operand register size. + unsigned AndOp = ARM::VANDd; + unsigned BicOp = ARM::VBICd; + unsigned OrrOp = ARM::VORRd; + unsigned BroadcastOp = ARM::VDUP32d; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(DestReg); + + if (ARM::QPRRegClass.hasSubClassEq(RC)) { + AndOp = ARM::VANDq; + BicOp = ARM::VBICq; + OrrOp = ARM::VORRq; + BroadcastOp = ARM::VDUP32q; + } + + unsigned RsbOp = Subtarget.isThumb2() ? ARM::t2RSBri : ARM::RSBri; + + // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1, $src2, $cond)) + Register VectorMaskReg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + Register Src2Reg = MI.getOperand(4).getReg(); + Register CondReg = MI.getOperand(5).getReg(); + + // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask) + + // 1. mask = 0 - cond + // When cond = 0: mask = 0x00000000. + // When cond = 1: mask = 0xFFFFFFFF. + + MachineInstr *FirstNewMI = + BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 2. A = src1 & mask + // For vectors, broadcast the scalar mask so it matches operand size. + BuildMI(*MBB, MI, DL, get(BroadcastOp), VectorMaskReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. B = src2 & ~mask + BuildMI(*MBB, MI, DL, get(BicOp), VectorMaskReg) + .addReg(Src2Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. 
result = A | B + auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + auto BundleStart = FirstNewMI->getIterator(); + auto BundleEnd = LastNewMI->getIterator(); + + // Add instruction bundling + finalizeBundle(*MBB, BundleStart, std::next(BundleEnd)); + + MI.eraseFromParent(); + return true; +} + +// Expands the ctselect pseudo for thumb1, post-RA. +bool ARMBaseInstrInfo::expandCtSelectThumb(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + // pseudos in thumb1 mode have: (outs $dst, $tmp_mask), (ins $src1, $src2, $cond)) + // register class here is always tGPR. + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + Register Src1Reg = MI.getOperand(2).getReg(); + Register Src2Reg = MI.getOperand(3).getReg(); + Register CondReg = MI.getOperand(4).getReg(); + + // Access register info + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned RegSize = TRI->getRegSizeInBits(MaskReg, MRI); + unsigned ShiftAmount = RegSize - 1; + + // Option 1: Shift-based mask (preferred - no flag modification) + MachineInstr *FirstNewMI = + BuildMI(*MBB, MI, DL, get(ARM::tMOVr), MaskReg) + .addReg(CondReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Instead of using RSB, we can use LSL and ASR to get the mask. This is to avoid the flag modification caused by RSB. + BuildMI(*MBB, MI, DL, get(ARM::tLSLri), MaskReg) + .addReg(MaskReg) + .addImm(ShiftAmount) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(ARM::tASRri), MaskReg) + .addReg(MaskReg) + .addImm(ShiftAmount) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 2. xor_diff = src1 ^ src2 + BuildMI(*MBB, MI, DL, get(ARM::tMOVr), DestReg) + .addReg(Src1Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg) + .addReg(DestReg) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. masked_xor = xor_diff & mask + BuildMI(*MBB, MI, DL, get(ARM::tAND), DestReg) + .addReg(DestReg) + .addReg(MaskReg, RegState::Kill) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. result = src2 ^ masked_xor + auto LastMI = BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg) + .addReg(DestReg) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Add instruction bundling + auto BundleStart = FirstNewMI->getIterator(); + finalizeBundle(*MBB, BundleStart, std::next(LastMI->getIterator())); + + MI.eraseFromParent(); + return true; +} + +// Expands the ctselect pseudo, post-RA. +bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + Register DestRegSavedRef = DestReg; + Register Src1Reg, Src2Reg, CondReg; + + // These operations will differ by operand register size. 
+ unsigned RsbOp = ARM::RSBri; + unsigned AndOp = ARM::ANDrr; + unsigned BicOp = ARM::BICrr; + unsigned OrrOp = ARM::ORRrr; + + if (Subtarget.isThumb2()) { + RsbOp = ARM::t2RSBri; + AndOp = ARM::t2ANDrr; + BicOp = ARM::t2BICrr; + OrrOp = ARM::t2ORRrr; + } + + unsigned Opcode = MI.getOpcode(); + bool IsFloat = Opcode == ARM::CTSELECTf32 || Opcode == ARM::CTSELECTf16 || Opcode == ARM::CTSELECTbf16; + MachineInstr *FirstNewMI = nullptr; + if (IsFloat) { + // Each float pseudo has: (outs $dst, $tmp_mask, $scratch1, $scratch2), (ins $src1, $src2, $cond)) + // We use two scratch registers in tablegen for bitwise ops on float types,. + Register GPRScratch1 = MI.getOperand(2).getReg(); + Register GPRScratch2 = MI.getOperand(3).getReg(); + + // choice a from __builtin_ct_select(cond, a, b) + Src1Reg = MI.getOperand(4).getReg(); + // choice b from __builtin_ct_select(cond, a, b) + Src2Reg = MI.getOperand(5).getReg(); + // cond from __builtin_ct_select(cond, a, b) + CondReg = MI.getOperand(6).getReg(); + + // Move fp src1 to GPR scratch1 so we can do our bitwise ops + FirstNewMI = BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch1) + .addReg(Src1Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Move src2 to scratch2 + BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch2) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + Src1Reg = GPRScratch1; + Src2Reg = GPRScratch2; + // Reuse GPRScratch1 for dest after we are done working with src1. + DestReg = GPRScratch1; + } else { + // Any non-float, non-vector pseudo has: (outs $dst, $tmp_mask), (ins $src1, $src2, $cond)) + Src1Reg = MI.getOperand(2).getReg(); + Src2Reg = MI.getOperand(3).getReg(); + CondReg = MI.getOperand(4).getReg(); + } + + // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask) + + // 1. mask = 0 - cond + // When cond = 0: mask = 0x00000000. + // When cond = 1: mask = 0xFFFFFFFF. + auto TmpNewMI = BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // We use the first instruction in the bundle as the first instruction. + if (!FirstNewMI) + FirstNewMI = TmpNewMI; + + // 2. A = src1 & mask + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. B = src2 & ~mask + BuildMI(*MBB, MI, DL, get(BicOp), MaskReg) + .addReg(Src2Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. result = A | B + auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + if (IsFloat) { + // Return our result from GPR to the correct register type. 
+ LastNewMI =BuildMI(*MBB, MI, DL, get(ARM::VMOVSR), DestRegSavedRef) + .addReg(DestReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + auto BundleStart = FirstNewMI->getIterator(); + auto BundleEnd = LastNewMI->getIterator(); + + // Add instruction bundling + finalizeBundle(*MBB, BundleStart, std::next(BundleEnd)); + + MI.eraseFromParent(); + return true; +} + bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { + auto opcode = MI.getOpcode(); + + if (opcode == TargetOpcode::LOAD_STACK_GUARD) { expandLoadStackGuard(MI); MI.getParent()->erase(MI); return true; } - if (MI.getOpcode() == ARM::MEMCPY) { + if (opcode == ARM::MEMCPY) { expandMEMCPY(MI); return true; } + if (opcode == ARM::CTSELECTf64) { + if (Subtarget.isThumb1Only()) { + LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode << "replaced by: " << MI); + return expandCtSelectThumb(MI); + } else { + LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << "replaced by: " << MI); + return expandCtSelectVector(MI); + } + } + + if (opcode == ARM::CTSELECTv8i8 || + opcode == ARM::CTSELECTv4i16 || + opcode == ARM::CTSELECTv2i32 || + opcode == ARM::CTSELECTv1i64 || + opcode == ARM::CTSELECTv2f32 || + opcode == ARM::CTSELECTv4f16 || + opcode == ARM::CTSELECTv4bf16 || + opcode == ARM::CTSELECTv16i8 || + opcode == ARM::CTSELECTv8i16 || + opcode == ARM::CTSELECTv4i32 || + opcode == ARM::CTSELECTv2i64 || + opcode == ARM::CTSELECTv4f32 || + opcode == ARM::CTSELECTv2f64 || + opcode == ARM::CTSELECTv8f16 || + opcode == ARM::CTSELECTv8bf16) { + LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << "replaced by: " << MI); + return expandCtSelectVector(MI); + } + + if (opcode == ARM::CTSELECTint || + opcode == ARM::CTSELECTf16 || + opcode == ARM::CTSELECTbf16 || + opcode == ARM::CTSELECTf32) { + if (Subtarget.isThumb1Only()) { + LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode << "replaced by: " << MI); + return expandCtSelectThumb(MI); + } else { + LLVM_DEBUG(dbgs() << "Opcode " << opcode << "replaced by: " << MI); + return expandCtSelect(MI); + } + } + // This hook gets to expand COPY instructions before they become // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..f0e090f09f5dc 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -221,6 +221,12 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; + bool expandCtSelectVector(MachineInstr &MI) const; + + bool expandCtSelectThumb(MachineInstr &MI) const; + + bool expandCtSelect(MachineInstr &MI) const; + bool expandPostRAPseudo(MachineInstr &MI) const override; bool shouldSink(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 847b7af5a9b11..62f5b21a738dd 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -4200,6 +4200,92 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. 
break; } + case ARMISD::CTSELECT: { + EVT VT = N->getValueType(0); + unsigned PseudoOpcode; + bool IsFloat = false; + bool IsVector = false; + + if (VT == MVT::f16) { + PseudoOpcode = ARM::CTSELECTf16; + IsFloat = true; + } else if (VT == MVT::bf16) { + PseudoOpcode = ARM::CTSELECTbf16; + IsFloat = true; + } else if (VT == MVT::f32) { + PseudoOpcode = ARM::CTSELECTf32; + IsFloat = true; + } else if (VT == MVT::f64) { + PseudoOpcode = ARM::CTSELECTf64; + IsVector = true; + } else if (VT == MVT::v8i8) { + PseudoOpcode = ARM::CTSELECTv8i8; + IsVector = true; + } else if (VT == MVT::v4i16) { + PseudoOpcode = ARM::CTSELECTv4i16; + IsVector = true; + } else if (VT == MVT::v2i32) { + PseudoOpcode = ARM::CTSELECTv2i32; + IsVector = true; + } else if (VT == MVT::v1i64) { + PseudoOpcode = ARM::CTSELECTv1i64; + IsVector = true; + } else if (VT == MVT::v2f32) { + PseudoOpcode = ARM::CTSELECTv2f32; + IsVector = true; + } else if (VT == MVT::v4f16) { + PseudoOpcode = ARM::CTSELECTv4f16; + IsVector = true; + } else if (VT == MVT::v4bf16) { + PseudoOpcode = ARM::CTSELECTv4bf16; + IsVector = true; + } else if (VT == MVT::v16i8) { + PseudoOpcode = ARM::CTSELECTv16i8; + IsVector = true; + } else if (VT == MVT::v8i16) { + PseudoOpcode = ARM::CTSELECTv8i16; + IsVector = true; + } else if (VT == MVT::v4i32) { + PseudoOpcode = ARM::CTSELECTv4i32; + IsVector = true; + } else if (VT == MVT::v2i64) { + PseudoOpcode = ARM::CTSELECTv2i64; + IsVector = true; + } else if (VT == MVT::v4f32) { + PseudoOpcode = ARM::CTSELECTv4f32; + IsVector = true; + } else if (VT == MVT::v2f64) { + PseudoOpcode = ARM::CTSELECTv2f64; + IsVector = true; + } else if (VT == MVT::v8f16) { + PseudoOpcode = ARM::CTSELECTv8f16; + IsVector = true; + } else if (VT == MVT::v8bf16) { + PseudoOpcode = ARM::CTSELECTv8bf16; + IsVector = true; + } else { + // i1, i8, i16, i32, i64 + PseudoOpcode = ARM::CTSELECTint; + } + + SmallVector VTs; + VTs.push_back(VT); // $dst + VTs.push_back(MVT::i32); // $tmp_mask (always GPR) + + if (IsVector) { + VTs.push_back(VT); // $bcast_mask (same type as dst for vectors) + } else if (IsFloat) { + VTs.push_back(MVT::i32); // $scratch1 (GPR) + VTs.push_back(MVT::i32); // $scratch2 (GPR) + } + + // src1, src2, cond + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + + SDNode *ResNode = CurDAG->getMachineNode(PseudoOpcode, SDLoc(N), VTs, Ops); + ReplaceNode(N, ResNode); + return; + } case ARMISD::VZIP: { EVT VT = N->getValueType(0); // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
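The generic fallbacks in SelectionDAGBuilder above and the ARM post-RA expansions (expandCtSelect / expandCtSelectThumb) all reduce ctselect to the same branchless bit-mask idiom. A minimal stand-alone sketch of the two variants follows; the function names are illustrative and not part of the patch, and 32-bit operands with arithmetic right shift (as on ARM) are assumed:

```cpp
#include <cassert>
#include <cstdint>

// (a & mask) | (b & ~mask) with mask = 0 - cond, mirroring the RSB/AND/BIC/ORR
// sequence emitted by expandCtSelect. No data-dependent branch is involved.
static uint32_t ct_select_mask(uint32_t cond, uint32_t a, uint32_t b) {
  uint32_t mask = 0u - (cond & 1u); // 0x00000000 or 0xFFFFFFFF
  return (a & mask) | (b & ~mask);
}

// b ^ ((a ^ b) & mask), with the mask built from a shift pair as in the Thumb1
// expansion, which uses LSL/ASR instead of RSB to avoid clobbering the flags.
static uint32_t ct_select_xor(uint32_t cond, uint32_t a, uint32_t b) {
  uint32_t mask = static_cast<uint32_t>(static_cast<int32_t>(cond << 31) >> 31);
  return b ^ ((a ^ b) & mask);
}

int main() {
  for (uint32_t c = 0; c <= 1; ++c) {
    assert(ct_select_mask(c, 7u, 9u) == (c ? 7u : 9u));
    assert(ct_select_xor(c, 7u, 9u) == (c ? 7u : 9u));
  }
}
```

Either form yields the first operand when the condition is non-zero and the second otherwise, matching the llvm.ct.select operand order used throughout the patch.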
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 67ea2dd3df792..c5729aa990bf6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -203,6 +203,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); @@ -304,6 +305,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); @@ -355,6 +357,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; @@ -408,6 +411,28 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, MVT::v4f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + } + + if (Subtarget->hasBF16()) { + setOperationAction(ISD::CTSELECT, MVT::v4bf16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8bf16, Custom); + } + + // small exotic vectors get scalarised for ctselect + setOperationAction(ISD::CTSELECT, MVT::v1i8, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1i16, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1i32, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1f32, Expand); + setOperationAction(ISD::CTSELECT, MVT::v2i8, Expand); + + setOperationAction(ISD::CTSELECT, MVT::v2i16, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::v2i16, MVT::v4i16); + setOperationAction(ISD::CTSELECT, MVT::v4i8, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::v4i8, MVT::v8i8); + // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. 
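Whether a small vector is scalarised by the type legalizer (ScalarizeVecRes_CTSELECT / SplitRes_CTSELECT above) or expanded per lane by the ISD::CTSELECT case added to SelectionDAGLegalize::ExpandNode, the end result is the same per-lane application of the scalar select. A rough model of that per-lane behaviour, with illustrative names not taken from the patch:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Per-lane model of the vector expansion: extract each element, apply the
// scalar constant-time select, and rebuild the vector (EXTRACT_VECTOR_ELT /
// BUILD_VECTOR in the DAG).
template <std::size_t N>
std::array<uint32_t, N> expand_ctselect(uint32_t cond,
                                        const std::array<uint32_t, N> &t,
                                        const std::array<uint32_t, N> &f) {
  const uint32_t mask = 0u - (cond & 1u); // same scalar mask as the sketch above
  std::array<uint32_t, N> out{};
  for (std::size_t i = 0; i < N; ++i)
    out[i] = (t[i] & mask) | (f[i] & ~mask);
  return out;
}
```

The scalar Expand path added to SelectionDAGLegalize::ExpandNode handles the non-vector cases in a similar spirit: floating-point values are bitcast to a same-width integer before the select, and wide integers are split into low/high halves that are selected independently and re-paired.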
@@ -419,6 +444,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -474,6 +500,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -1237,10 +1264,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::i8, Promote); + setOperationAction(ISD::CTSELECT, MVT::i16, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::i16, MVT::i32); + + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::i64, Expand); + setOperationAction(ISD::CTSELECT, MVT::f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::f64, Custom); + + // Handle f16 and bf16 without falling back to select from ctselect. + setTargetDAGCombine({ISD::CTSELECT}); + if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::f16, Custom); + } + + if (Subtarget->hasBF16()) { + setOperationAction(ISD::CTSELECT, MVT::bf16, Custom); } setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); @@ -1567,6 +1611,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(ARMISD::BCC_i64) MAKE_CASE(ARMISD::FMSTAT) MAKE_CASE(ARMISD::CMOV) + MAKE_CASE(ARMISD::CTSELECT) MAKE_CASE(ARMISD::SSAT) MAKE_CASE(ARMISD::USAT) MAKE_CASE(ARMISD::ASRL) @@ -5103,6 +5148,20 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } +SDValue ARMTargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + + SDValue Cond = Op.getOperand(0); + SDValue TrueVal = Op.getOperand(1); + SDValue FalseVal = Op.getOperand(2); + EVT VT = Op.getValueType(); + + // Normalise the condition to 0 or 1. 
+ SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue CondNode = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One); + return DAG.getNode(ARMISD::CTSELECT, DL, VT, TrueVal, FalseVal, CondNode); +} + static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for @@ -10599,6 +10658,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); @@ -10815,6 +10875,36 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::FP_TO_UINT_SAT: Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); break; + case ISD::CTSELECT: { + EVT VT = N->getValueType(0); + + // Handle f16/bf16 type promotion while preserving ctselect + if (VT == MVT::f16 || VT == MVT::bf16) { + SDLoc DL(N); + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Bitcast to i16, then promote to i32 + SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal); + SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal); + + TrueInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueInt); + FalseInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseInt); + + // Normalize condition + SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue CondNorm = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One); + + // Create i32 ctselect that will go through normal lowering + Res = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, + CondNorm, TrueInt, FalseInt); + } else { + // For other types, use existing lowering + Res = LowerCTSELECT(SDValue(N, 0), DAG); + } + break; + } } if (Res.getNode()) Results.push_back(Res); @@ -13371,6 +13461,63 @@ static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); } +static SDValue PerformCTSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) { + return SDValue(); + } + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + EVT VT = N->getValueType(0); + if (VT == MVT::f16 || VT == MVT::bf16) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal); + SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal); + + // Create i16 ctselect - this will be promoted to i32 ctselect naturally + SDValue Result = DAG.getNode(ISD::CTSELECT, DL, MVT::i16, + Cond, TrueInt, FalseInt); + + return DAG.getBitcast(VT, Result); + } else if (VT.isVector()) { + EVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f16 || EltVT == MVT::bf16) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + EVT IntVT; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4f16: + case MVT::v4bf16: + IntVT = MVT::v4i16; + break; + case MVT::v8f16: + case MVT::v8bf16: + IntVT = MVT::v8i16; + break; + default: + return SDValue(); // Unsupported vector type + } + + SDValue TrueInt = DAG.getBitcast(IntVT, TrueVal); + SDValue FalseInt = DAG.getBitcast(IntVT, FalseVal); + + SDValue Result = 
DAG.getNode(ISD::CTSELECT, DL, IntVT, + Cond, TrueInt, FalseInt); + + return DAG.getBitcast(VT, Result); + } + } + + return SDValue(); +} + static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -18874,6 +19021,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); + case ISD::CTSELECT: return PerformCTSELECTCombine(N, DCI, Subtarget); case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 70aa001a41885..5ca1769087873 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -97,6 +97,9 @@ class VectorType; CMOV, // ARM conditional move instructions. + CTSELECT, // ARM constant-time select, implemented with constant-time + // bitwise arithmetic instructions. + SSAT, // Signed saturation USAT, // Unsigned saturation @@ -430,8 +433,12 @@ class VectorType; const char *getTargetNodeName(unsigned Opcode) const override; bool isSelectSupported(SelectSupportKind Kind) const override { - // ARM does not support scalar condition selects on vectors. - return (Kind != ScalarCondVectorVal); + if (Kind == SelectSupportKind::CtSelect) { + return true; + } else { + // ARM does not support scalar condition selects on vectors. + return (Kind != SelectSupportKind::ScalarCondVectorVal); + } } bool isReadOnly(const GlobalValue *GV) const; @@ -880,6 +887,7 @@ class VectorType; SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; @@ -1025,6 +1033,7 @@ class VectorType; MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; + void addMVEVectorTypes(bool HasMVEFP); void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action); void setAllExpand(MVT VT); diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 282ff534fc112..b8597f97b43df 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -32,6 +32,13 @@ def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_ARMCtSelect : SDTypeProfile<1, 3, [ + /* any */ // result + SDTCisSameAs<1, 0>, // value on false + SDTCisSameAs<2, 0>, // value on true + SDTCisVT<3, i32> // cond +]>; + def SDT_ARMCMov : SDTypeProfile<1, 4, [ /* any */ // result SDTCisSameAs<1, 0>, // value on false @@ -188,6 +195,7 @@ def ARMseretglue : SDNode<"ARMISD::SERET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMintretglue : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def ARMctselect : SDNode<"ARMISD::CTSELECT", SDT_ARMCtSelect>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov>; def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; @@ -5108,6 
+5116,226 @@ def : ARMPat<(ARMcmov i32:$false, mod_imm_not:$imm, imm:$cc, CPSR), def : ARMV6T2Pat<(ARMcmov i32:$false, imm:$src, imm:$cc, CPSR), (MOVCCi32imm $false, imm:$src, imm:$cc, CPSR)>; +//===----------------------------------------------------------------------===// +// Constant-time selection pseudoinstructions. +// We use a machine pass to lower these pseudos as applicable by subtarget, +// in order to avoid backend optimizations that could invalidate constant-time +// guarantees to the source programmer by node merging or other operations that +// would result in machine code that does not run in constant time. +let isNotDuplicable = 1, + isPseudo = 1, + hasNoSchedulingInfo = 1 in { + + // i1, i8, i16, i32, i64 + def CTSELECTint : ARMPseudoInst< + (outs GPR:$dst, GPR:$tmp_mask), + (ins GPR:$src1, GPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask"; + } + + def CTSELECTf16 : ARMPseudoInst< + (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins HPR:$src1, HPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + def CTSELECTbf16 : ARMPseudoInst< + (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins HPR:$src1, HPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + def CTSELECTf32 : ARMPseudoInst< + (outs SPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins SPR:$src1, SPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + let Predicates = [HasDPVFP] in { + def CTSELECTf64 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + } + + let Predicates = [HasNEON] in { + // DPR + def CTSELECTv8i8 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4i16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2i32 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv1i64 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2f32 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4f16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, 
DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4bf16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + + // QPR + def CTSELECTv16i8 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8i16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4i32 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2i64 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4f32 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2f64 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8f16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8bf16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + } +} + //===----------------------------------------------------------------------===// // Atomic operations intrinsics // diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 86740a92b32c5..18d47d9c68767 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -519,7 +519,8 @@ void ARMPassConfig::addPreEmitPass() { // Constant island pass work on unbundled instructions. addPass(createUnpackMachineBundles([](const MachineFunction &MF) { - return MF.getSubtarget().isThumb2(); + return MF.getSubtarget().isThumb2() || + MF.getFunction().hasFnAttribute("ct-select"); })); // Don't optimize barriers or block placement at -O0. 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8e08d16342975..d306d489a43d2 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -828,9 +828,10 @@ include "X86SchedSapphireRapids.td" def ProcessorFeatures { // x86-64 micro-architecture levels: x86-64 and x86-64-v[234] - list X86_64V1Features = [ - FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, FeatureX86_64, + list X86_64V1Features = [FeatureX87, FeatureCX8, + FeatureCMOV, FeatureMMX, + FeatureSSE2, FeatureFXSR, + FeatureNOPL, FeatureX86_64, ]; list X86_64V1Tuning = [ TuningMacroFusion, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a0b64ff370b10..a11ef3833b69b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" #include "X86FrameLowering.h" @@ -29,6 +30,8 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -48,6 +51,7 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" @@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // X86 wants to expand cmov itself. 
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); @@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); + setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); @@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, VT, Action); setOperationAction(ISD::SETCC, VT, Action); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Action); setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); @@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); @@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); @@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom); + for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); @@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); + setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); @@ 
-1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); @@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); @@ -2538,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::x86amx, &X86::TILERegClass); } + // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand + // This allows type legalization to split them into smaller vectors + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + + // Handle 256-bit vector CTSELECT without AVX by setting them to Expand + // This allows type legalization to split them into 128-bit vectors + if (!Subtarget.hasAVX()) { + for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16, + MVT::v16f16, MVT::v32i8, MVT::v8f32}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } + // We want to custom lower some of our intrinsics. 
 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2643,6 +2688,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::BITCAST,
                        ISD::VSELECT,
                        ISD::SELECT,
+                       ISD::CTSELECT,
                        ISD::SHL,
                        ISD::SRA,
                        ISD::SRL,
@@ -25321,6 +25367,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
   return V;
 }
+SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);    // condition
+  SDValue TrueOp = Op.getOperand(1);  // true_value
+  SDValue FalseOp = Op.getOperand(2); // false_value
+  SDLoc DL(Op);
+  MVT VT = TrueOp.getSimpleValueType();
+
+  // Special handling for i386 targets (no CMOV): route to post-RA expansion
+  // pseudos. Standard type legalization handles i64 automatically (it is
+  // split into EDX:EAX).
+
+  // Handle soft float16 by converting to integer operations
+  if (isSoftF16(VT, Subtarget)) {
+    MVT NVT = VT.changeTypeToInteger();
+    SDValue CtSelect =
+        DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp),
+                    DAG.getBitcast(NVT, TrueOp));
+    return DAG.getBitcast(VT, CtSelect);
+  }
+
+  // Handle vector types
+  if (VT.isVector()) {
+    // Handle soft float16 vectors
+    if (isSoftF16(VT, Subtarget)) {
+      MVT NVT = VT.changeVectorElementTypeToInteger();
+      SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
+                                     DAG.getBitcast(NVT, FalseOp),
+                                     DAG.getBitcast(NVT, TrueOp));
+      return DAG.getBitcast(VT, CtSelect);
+    }
+
+    unsigned VectorWidth = VT.getSizeInBits();
+    MVT EltVT = VT.getVectorElementType();
+
+    // 512-bit vectors without AVX512 and 256-bit vectors without AVX are now
+    // handled by type legalization (Expand action).
+
+    if (VectorWidth == 128 && !Subtarget.hasSSE1())
+      return SDValue();
+
+    // Handle special cases for floating point vectors
+    if (EltVT.isFloatingPoint()) {
+      // For vector floating point with AVX, use VBLENDV-style operations
+      if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+        // Convert to bitwise operations using the condition
+        MVT IntVT = VT.changeVectorElementTypeToInteger();
+        SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+        SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+        // Create the CTSELECT node with integer types
+        SDValue IntResult =
+            DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1,
+                        DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+                        EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+        return DAG.getBitcast(VT, IntResult);
+      }
+    }
+
+    // For integer vectors or when we don't have advanced SIMD support,
+    // use the generic X86 CTSELECT node which will be matched by the patterns
+    SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+    SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+    // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
+    return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1)
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
+
+  /// Process condition flags and prepare for CTSELECT node creation
+  auto ProcessConditionFlags =
+      [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+          const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+    SDValue CC;
+    bool AddTest = true;
+
+    unsigned CondOpcode = Cond.getOpcode();
+    if (CondOpcode == 
X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + SDValue Cmp = Cond.getOperand(1); + + if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) { + Cond = Cmp; + AddTest = false; + } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { + SDValue Value; + X86::CondCode X86Cond; + std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); + CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); + AddTest = false; + } + + if (AddTest) { + // Look past the truncate if the high bits are known zero + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // Try to match AND to BT instruction + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { + CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); + Cond = BT; + AddTest = false; + } + } + } + + if (AddTest) { + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); + } + + return {CC, Cond}; + }; + + // Process condition flags and prepare for CTSELECT + auto [CC, ProcessedCond] = + ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget); + + // Handle i8 CTSELECT with truncate optimization + if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE && + FalseOp.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + T1.getOpcode() != ISD::CopyFromReg && + T2.getOpcode() != ISD::CopyFromReg) { + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), + T2, T1, CC, ProcessedCond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + } + } + + // Promote small integer types to avoid partial register stalls + // Exception: For i8 without CMOV, we can generate a shorter instruction + // sequence without movzx so keep it as is. + if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) || + (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) && + !X86::mayFoldLoad(FalseOp, Subtarget))) { + TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp); + FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + } + + if (isScalarFPTypeInSSEReg(VT)) { + MVT IntVT = (VT == MVT::f32) ? 
MVT::i32 : MVT::i64; + TrueOp = DAG.getBitcast(IntVT, TrueOp); + FalseOp = DAG.getBitcast(IntVT, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops); + return DAG.getBitcast(VT, CtSelect); + } + + // Create final CTSELECT node + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops, + Op->getFlags()); +} + static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); @@ -33684,6 +33898,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); @@ -33767,6 +33982,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const { + if (Kind == SelectSupportKind::CtSelect) { + return true; + } + return TargetLoweringBase::isSelectSupported(Kind); +} /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -34994,6 +35215,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STRICT_CMPM) NODE_NAME_CASE(CMPMM_SAE) NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(CTSELECT) NODE_NAME_CASE(SETCC_CARRY) NODE_NAME_CASE(FSETCC) NODE_NAME_CASE(FSETCCM) @@ -37767,6 +37989,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI, return BB; } +/// Helper function to emit i386 CTSELECT with condition materialization. +/// This converts EFLAGS-based CTSELECT into a condition byte that can be +/// shared across multiple operations (critical for i64 type legalization). +/// +/// Phase 1: Materialize condition byte from EFLAGS using SETCC +/// Phase 2: Create internal pseudo with condition byte for post-RA expansion +/// +/// This approach ensures that when i64 is type-legalized into two i32 +/// operations, both operations share the same condition byte rather than +/// each independently reading (and destroying) EFLAGS. 
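+///
+/// Illustrative C-level sketch of the selection the expanded bundle performs
+/// (an assumption for exposition only; the exact instruction sequence is
+/// produced by the post-RA expansion of the internal pseudo, not here):
+///
+///   uint32_t mask = 0u - (uint32_t)cond_byte;  // cond_byte ? all-ones : 0
+///   dst = (src1 & mask) | (src2 & ~mask);      // branch-free select
+///
+/// Because both halves of a legalized i64 reuse the same cond_byte, the
+/// selection stays branch-free and constant-time after type legalization.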
+static MachineBasicBlock * +emitCTSelectI386WithConditionMaterialization(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned InternalPseudoOpcode) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const MIMetadata MIMD(MI); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Original pseudo operands: (outs dst), (ins src1, src2, cond) + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast(MI.getOperand(3).getImm()); + + // Get opposite condition (SETCC sets to 1 when condition is TRUE, + // but we want to select src1 when condition is FALSE for X86 semantics) + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + // Step 1: Materialize condition byte from EFLAGS + // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + // Step 2: Create internal pseudo that takes condition byte as input + // This pseudo will be expanded post-RA into the actual constant-time bundle + // The condition byte can now be safely shared between multiple pseudos + + // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1, + // src2, cond_byte) + Register DstReg = MI.getOperand(0).getReg(); + + // Create virtual registers for the temporary outputs + Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + Register TmpMaskReg; + + // Determine the register class for tmp_mask based on the data type + if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass); + } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass); + } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + } else { + llvm_unreachable("Unknown internal pseudo opcode"); + } + + BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode)) + .addDef(DstReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(Src1Reg) // src1 (input) + .addReg(Src2Reg) // src2 (input) + .addReg(CondByteReg); // pre-materialized condition byte (input) + + MI.eraseFromParent(); + return BB; +} + +// Helper structure to hold memory operand information for FP loads +struct FPLoadMemOperands { + bool IsValid = false; + unsigned BaseReg = 0; + int64_t ScaleVal = 1; + unsigned IndexReg = 0; + int64_t Disp = 0; + unsigned SegReg = 0; + int FrameIndex = -1; + bool IsFrameIndex = false; + int ConstantPoolIndex = -1; + bool IsConstantPool = false; + const GlobalValue *Global = nullptr; + int64_t GlobalOffset = 0; + bool IsGlobal = false; +}; + +// Check if a virtual register is defined by a simple FP load instruction +// Returns the memory operands if it's a simple load, otherwise returns invalid +static FPLoadMemOperands getFPLoadMemOperands(Register Reg, + MachineRegisterInfo &MRI, + unsigned ExpectedLoadOpcode) { + FPLoadMemOperands Result; + + if (!Reg.isVirtual()) + return Result; + + MachineInstr *DefMI = MRI.getVRegDef(Reg); + if (!DefMI) + return Result; + + // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m) + if (DefMI->getOpcode() != ExpectedLoadOpcode) + return Result; + + // Check that this is a simple load - not volatile, 
not atomic, etc. + // FP loads have hasSideEffects = 0 in their definition for simple loads + if (DefMI->hasOrderedMemoryRef()) + return Result; + + // The load should have a single def (the destination register) and memory operands + // Format: %reg = LD_Fpxxm , 1, %noreg, 0, %noreg + // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment + if (DefMI->getNumOperands() < 6) + return Result; + + // Operand 0 is the destination, operands 1-5 are the memory reference + MachineOperand &BaseMO = DefMI->getOperand(1); + MachineOperand &ScaleMO = DefMI->getOperand(2); + MachineOperand &IndexMO = DefMI->getOperand(3); + MachineOperand &DispMO = DefMI->getOperand(4); + MachineOperand &SegMO = DefMI->getOperand(5); + + // Check if this is a frame index load + if (BaseMO.isFI()) { + Result.IsValid = true; + Result.IsFrameIndex = true; + Result.FrameIndex = BaseMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a constant pool load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isCPI() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsConstantPool = true; + Result.ConstantPoolIndex = DispMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a global variable load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isGlobal() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsGlobal = true; + Result.Global = DispMO.getGlobal(); + Result.GlobalOffset = DispMO.getOffset(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Regular memory operands (e.g., pointer loads) + if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && + DispMO.isImm() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsFrameIndex = false; + Result.IsConstantPool = false; + Result.BaseReg = BaseMO.getReg(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + return Result; +} + +static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned pseudoInstr) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const MIMetadata MIMD(MI); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + unsigned RegSizeInByte = 4; + + // Get operands + // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned FalseReg = MI.getOperand(1).getReg(); + unsigned TrueReg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast(MI.getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + // Materialize condition byte from EFLAGS + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, 
TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) { + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot) + .addReg(Reg, RegState::Kill); + }; + + // Helper to load integer from memory operands + auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps, + unsigned Offset) -> unsigned { + unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg); + + if (MemOps.IsFrameIndex) { + // Frame index: addFrameIndex + scale + index + disp + segment + MIB.addFrameIndex(MemOps.FrameIndex) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } else if (MemOps.IsConstantPool) { + // Constant pool: base_reg + scale + index + CP_index + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index) + .addReg(MemOps.SegReg); // Segment + } else if (MemOps.IsGlobal) { + // Global variable: base_reg + scale + index + global + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address) + .addReg(MemOps.SegReg); // Segment + } else { + // Regular memory: base_reg + scale + index + disp + segment + MIB.addReg(MemOps.BaseReg) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } + + return IntReg; + }; + + // Optimized path: load integers directly from memory when both operands are + // memory loads, avoiding FP register round-trip + auto emitCtSelectFromMemory = [&](unsigned NumValues, + const FPLoadMemOperands &TrueMemOps, + const FPLoadMemOperands &FalseMemOps, + int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // Load true and false values directly from their memory locations as integers + unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset); + unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + + auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // 
Load true and false values from stack as 32-bit integers + unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg) + .addFrameIndex(TrueSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0); + + unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg) + .addFrameIndex(FalseSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + + switch (pseudoInstr) { + case X86::CTSELECT_I386_FP32rr: { + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m); + + int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + // and have loaded the data directly as integers instead + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot); + } + + // Load result back as f32 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg), + ResultSlot); + break; + } + case X86::CTSELECT_I386_FP64rr: { + unsigned StackSlotSize = 8; + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m); + + int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackSlotSize / 
RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot, + ResultSlot); + } + + // Load result back as f64 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg), + ResultSlot); + break; + } + case X86::CTSELECT_I386_FP80rr: { + // f80 is 80 bits (10 bytes), but stored with 12-byte alignment + unsigned StackObjectSize = 12; + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m); + + int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot, + FalseSlot, ResultSlot); + } + + // Load result back as f80 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg), + ResultSlot); + break; + } + default: + llvm_unreachable("Invalid CTSELECT opcode"); + } + + MI.eraseFromParent(); + + return BB; +} + MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -37828,6 +38524,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); + case X86::CTSELECT_I386_GR8rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR8rr); + + case X86::CTSELECT_I386_GR16rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR16rr); + + case X86::CTSELECT_I386_GR32rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR32rr); + + case X86::CTSELECT_I386_FP32rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr); + case X86::CTSELECT_I386_FP64rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr); + case 
X86::CTSELECT_I386_FP80rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr); + case X86::FP80_ADDr: case X86::FP80_ADDm32: { // Change the floating point control register to use double extended diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e28b9c11a04cd..f79eec03de23c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -114,6 +114,10 @@ namespace llvm { /// X86 Select SELECTS, + /// X86 Constant-time Select, implemented with CMOV instruction. This is + /// used to implement constant-time select. + CTSELECT, + // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 @@ -1139,6 +1143,8 @@ namespace llvm { /// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + bool isSelectSupported(SelectSupportKind Kind) const override; + /// Replace the results of node with an illegal result /// type with new values built out of custom code. /// @@ -1766,6 +1772,7 @@ namespace llvm { SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 7d5d7cf4a83ab..9c34889f03354 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in { def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS), (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; } + +// Create pseudo instruction and do the pattern matching to them. 
+// We use a machine pass to lower these pseudos into cmov, in order
+// to avoid backend optimizations that could break constant-time guarantees.
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+  multiclass CTSELECT<X86TypeInfo t> {
+    // register-only
+    let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+        AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+      def rr : PseudoI<(outs t.RegClass:$dst),
+                 (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+                 [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+    }
+
+    // register-memory
+    let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+        AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+      def rm : PseudoI<(outs t.RegClass:$dst),
+                 (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+                 [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+    }
+  }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+  let Constraints = "$dst = $src1" in {
+    defm CTSELECT16 : CTSELECT<Xi16>;
+    defm CTSELECT32 : CTSELECT<Xi32>;
+    defm CTSELECT64 : CTSELECT<Xi64>;
+  }
+}
+
+// CTSELECT_VEC base class
+class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+    : PseudoI<
+        (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+        (ins VRc:$t, VRc:$f, i8imm:$cond),
+        []
+      > {
+  let Uses = [EFLAGS];
+  let isPseudo = 1;
+  let isNotDuplicable = 1;
+  let hasSideEffects = 1;
+  let AsmString = "ctselect\t$dst, $f, $t, $cond";
+  let SchedRW = [];
+}
+
+// Width-specific class aliases
+class CTSELECT_VEC128 : CTSELECT_VEC<VR128, GR32>;
+class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
+class CTSELECT_VEC256 : CTSELECT_VEC<VR256, GR32>;
+class CTSELECT_VEC512 : CTSELECT_VEC<VR512, GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+  def CTSELECT_V4F32 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasSSE2] in {
+
+  def CTSELECT_V2F64 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V4I32 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V2I64 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V8I16 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V16I8 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // If your build has v8f16, keep this; otherwise comment it out.
+ def CTSELECT_V8F16 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +let Predicates = [HasAVX] in { + + def CTSELECT_V4F32X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2F64X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I32X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2I64X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I8X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v8f16, keep this; otherwise comment it out. + def CTSELECT_V8F16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +//===----------------------------------------------------------------------===// +// 256-bit pseudos +//===----------------------------------------------------------------------===// +let Predicates = [HasAVX] in { + + def CTSELECT_V8F32 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4F64 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I32 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I64 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V32I8 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v16f16, keep this; otherwise comment it out. + def CTSELECT_V16F16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +//===----------------------------------------------------------------------===// +// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V* +// +// NOTE: +// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue). +// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read. +// * Temps (tmpx/tmpy,tmpg) are not in the pattern; they’re outs allocated by RA. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE1] in { + + // 128-bit float (bitwise-equivalent ops in expander) + def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasSSE2] in { + + // 128-bit integer + def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>; + + // 128-bit f16 (optional) + def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasAVX] in { + + // 256-bit integer + def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit float (bitwise-equivalent ops in expander) + def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit f16 (optional) + def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>; +} + let Predicates = [HasCMOV, HasCF] in { def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS), (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ec31675731b79..f4163f55d66ce 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -693,6 +693,86 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +// CTSELECT +// Enhanced CTSELECT pseudos for i386 with temporary register allocation +// These use a two-phase approach: +// 1. Custom inserter materializes condition byte from EFLAGS +// 2. 
Post-RA expansion generates constant-time instruction bundles + +let isPseudo = 1, isNotDuplicable = 1 in { + // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter) + // These are matched by patterns and convert EFLAGS to condition byte + class CTSELECT_I386_INITIAL + : PseudoI<(outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$cond), + [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond, + EFLAGS)))]> { + let Uses = [EFLAGS]; + let Defs = [EFLAGS]; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; + } + + // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion) + // These generate the actual constant-time instruction bundles + class CTSELECT_I386_INTERNAL + : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask), + (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> { + let hasNoSchedulingInfo = 1; + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask"; + } +} + +// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL; + def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL; + def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL; + } +} + +// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + def CTSELECT_I386_INT_GR8rr : + CTSELECT_I386_INTERNAL; + def CTSELECT_I386_INT_GR16rr : + CTSELECT_I386_INTERNAL; + def CTSELECT_I386_INT_GR32rr : + CTSELECT_I386_INTERNAL; + } +} + +let hasSideEffects = 1, + ForceDisassemble = 1, + Constraints = "$dst = $src1" in { + + let Predicates = [FPStackf32] in + def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL; + + let Predicates = [FPStackf64] in + def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL; + + def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL; +} + +// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization) +// NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available +// even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV) +let Predicates = [NoNativeCMOV] in { + def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>; + + def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>; + + def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>; + + // i64 patterns handled automatically by type legalization +} + //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 116986a0fffea..4c9e5bae3b46c 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; +def SDTX86CtSelect : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + // Unary and binary 
operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, [SDTCisSameAs<0, 2>, @@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>; def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1d2cd39951bf4..ef270fc49a224 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -475,6 +475,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, return false; } +struct CtSelectInstructions { + unsigned PAndOpc; + unsigned PAndnOpc; + unsigned POrOpc; + unsigned BroadcastOpc; + unsigned IntMoveOpc; + unsigned MoveOpc; + bool Use256; + bool UseBlendInstr; +}; + +static CtSelectInstructions +getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { + CtSelectInstructions Instructions = {}; + + switch (Opcode) { + case X86::CTSELECT_V2F64: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPDrr; + Instructions.UseBlendInstr = true; + } else { + llvm_unreachable("Double precision vectors require SSE2"); + } + break; + case X86::CTSELECT_V4F32: + if (Subtarget.hasSSE41()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + Instructions.UseBlendInstr = true; + } else if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } else { + // fallback to SSE1, only support four 32-bit single precision + // floating-point values + Instructions.PAndOpc = X86::ANDPSrr; + Instructions.PAndnOpc = X86::ANDNPSrr; + Instructions.POrOpc = X86::ORPSrr; + Instructions.BroadcastOpc = X86::SHUFPSrri; + Instructions.IntMoveOpc = X86::MOVSS2DIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } + break; + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("Integer vector operations require SSE2"); + } + break; + case X86::CTSELECT_V8F16: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("FP16 vector operations require SSE2"); + } + 
break; + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V8F16X: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDrr; + Instructions.PAndnOpc = X86::VPANDNrr; + Instructions.POrOpc = X86::VPORrr; + Instructions.BroadcastOpc = X86::VPSHUFDri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr + : (Opcode == X86::CTSELECT_V2F64X) + ? X86::VMOVAPDrr + : X86::VMOVDQArr; + } else { + llvm_unreachable("AVX variants require AVX support"); + } + break; + case X86::CTSELECT_V8F32: + case X86::CTSELECT_V8I32: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V4I64: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPDYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V16F16: + if (Subtarget.hasAVX2()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit integer vectors require AVX"); + } + break; + default: + llvm_unreachable("Unexpected CTSELECT opcode"); + } + + return Instructions; +} + +bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + const DebugLoc &DL = MI.getDebugLoc(); + auto Instruction = getCtSelectInstructions(Opcode, Subtarget); + + MachineBasicBlock *MBB = MI.getParent(); + + // Operand layout matches the TableGen definition: + // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg), + // (ins VR128:$t, VR128:$f, i8imm:$cond) + Register Dst = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp + Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32) + Register FalseVal = MI.getOperand(3).getReg(); // true_value + Register TrueVal = MI.getOperand(4).getReg(); // false_value + X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition + + MachineInstr *FirstInstr = nullptr; + MachineInstr *LastInstr = nullptr; + auto recordInstr = [&](MachineInstrBuilder MIB) { + MachineInstr *NewMI = MIB.getInstr(); + 
LastInstr = NewMI; + if (!FirstInstr) + FirstInstr = NewMI; + }; + + // Create scalar mask in tempGPR and broadcast to vector mask + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr)) + .addReg(SubReg) + .addImm(CC) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + // Zero-extend byte to 32-bit register (movzbl %al, %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR) + .addReg(SubReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, + // %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR) + .addReg(TmpGPR) + .addImm(31)); + } else { + // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR) + .addReg(TmpGPR)); + } + + // Broadcast to TmpX (vector mask) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + // Move scalar mask to vector register + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg) + .addReg(TmpGPR) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + if (Instruction.Use256) { + // Broadcast to 256-bit vector register + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } else { + if (Subtarget.hasSSE2() || Subtarget.hasAVX()) { + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } else { + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } + } + + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Use dedicated blend instructions for SSE4.1+ + unsigned BlendOpc; + switch (Opcode) { + case X86::CTSELECT_V4F32: + BlendOpc = X86::BLENDVPSrr0; + break; + case X86::CTSELECT_V2F64: + BlendOpc = X86::BLENDVPDrr0; + break; + default: + // alias for pblendvb that takes xmm0 as implicit mask register + BlendOpc = X86::PBLENDVBrr0; + break; + } + + // Check if XMM0 is used as one of source registers, if yes then save it + // in Dst register and update FalseVal and TrueVal to Dst register + bool DidSaveXMM0 = false; + Register SavedXMM0 = X86::XMM0; + if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) { + Register SrcXMM0 = (FalseVal == X86::XMM0) ? 
FalseVal : TrueVal;
+
+      // If XMM0 currently holds one of the source values it will be
+      // clobbered when the blend mask is moved into XMM0 below, so preserve
+      // that value in Dst first.
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(SrcXMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // Retarget FalseVal and TrueVal from XMM0 to Dst.
+      if (FalseVal == X86::XMM0)
+        FalseVal = Dst;
+      if (TrueVal == X86::XMM0)
+        TrueVal = Dst;
+
+      // Remember that the original XMM0 value now lives in Dst.
+      SavedXMM0 = Dst;
+      DidSaveXMM0 = true;
+    } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+      // XMM0 was not allocated to any operand, but we still need to save its
+      // value (into Dst) and restore it after using XMM0 as the mask register.
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      SavedXMM0 = Dst;
+      DidSaveXMM0 = true;
+    }
+
+    if (MaskReg != X86::XMM0) {
+      // BLENDV uses XMM0 as the implicit mask register
+      // https://www.felixcloutier.com/x86/pblendvb
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                      .addReg(MaskReg)
+                      .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+      // move FalseVal to mask (use MaskReg as the dst of the blend)
+      recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+                      .addReg(MaskReg)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // restore XMM0 from SavedXMM0 if we saved it into Dst
+      if (DidSaveXMM0) {
+        recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                        .addReg(SavedXMM0)
+                        .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      }
+      // dst = result (now in MaskReg)
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(MaskReg)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    } else {
+      // move FalseVal to Dst first, since MaskReg is XMM0 and Dst is not
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+                      .addReg(Dst)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    }
+  } else {
+
+    // dst = mask
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // mask &= true_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+                    .addReg(MaskReg)
+                    .addReg(TrueVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst = ~mask & false_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+                    .addReg(Dst)
+                    .addReg(FalseVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst |= mask; (mask & t) | (~mask & f)
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+                    .addReg(Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+  }
+
+  assert(FirstInstr && LastInstr &&
+         "Expected at least one expanded instruction");
+  auto BundleEnd = LastInstr->getIterator();
+  finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+  MachineOperand &OperandRes = MI.getOperand(0);  // destination register
+  MachineOperand &OperandTrue = MI.getOperand(1); // true value
+  MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+  assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+         "Invalid operand types");
+  assert(OperandTrue.getReg() == OperandRes.getReg() &&
+         "Result register must match the true-value register");
+
+  assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+  unsigned Opcode = 0;
+
+  switch (MI.getOpcode()) {
+  case X86::CTSELECT16rr:
+    Opcode = X86::CMOV16rr;
+    break;
+  case X86::CTSELECT32rr:
+    Opcode = X86::CMOV32rr;
+    break;
+  case X86::CTSELECT64rr:
+    Opcode = X86::CMOV64rr;
+    break;
+  case X86::CTSELECT16rm:
+    Opcode = X86::CMOV16rm;
+    break;
+  case X86::CTSELECT32rm:
+    Opcode = X86::CMOV32rm;
+    break;
+  case X86::CTSELECT64rm:
+    Opcode = X86::CMOV64rm;
+    break;
+  default:
+    llvm_unreachable("Invalid CTSELECT opcode");
+  }
+
+  // Build the CMOV by copying every operand of the pseudo; the operand order
+  // (dst, true value, false value / memory operands, condition code) is
+  // identical for CTSELECT and CMOV.
+  MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+  for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+    CmovBuilder.add(MI.getOperand(i));
+  }
+
+  // Remove the original CTSELECT instruction
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Expand the i386-specific CTSELECT pseudo instructions (post-RA,
+/// constant-time). These internal pseudos receive a pre-materialized
+/// condition byte from the custom inserter, avoiding EFLAGS corruption
+/// issues during i64 type legalization.
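+///
+/// As a rough illustration only (the register choices below are hypothetical
+/// and depend entirely on allocation), a CTSELECT_I386_INT_GR32rr with the
+/// condition byte in %cl, src1 in %esi, src2 in %edi and dst in %eax is
+/// expected to expand to a bundle along the lines of:
+///
+///   movb   %cl,  %dl      # Step 1: copy the pre-materialized condition byte
+///   movzbl %dl,  %edx     # Step 2: zero-extend to 0 or 1
+///   negl   %edx           # Step 3: 0/1 -> all-zeros / all-ones mask
+///   movl   %esi, %eax     # Step 4: dst = src1
+///   andl   %edx, %eax     # Step 5: dst &= mask
+///   notl   %edx           # Step 6: mask = ~mask
+///   andl   %edi, %edx     # Step 7: mask &= src2
+///   orl    %edx, %eax     # Step 8: dst = (src1 & mask) | (src2 & ~mask)
+///
+/// The same straight-line sequence executes for either condition value, so
+/// the selected operand never influences the instruction trace.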
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+  // (ins src1, src2, cond_byte)
+  // Note: cond_byte is pre-materialized by the custom inserter, so this
+  // expansion does not depend on EFLAGS.
+  Register DstReg = MI.getOperand(0).getReg();
+  Register TmpByteReg = MI.getOperand(1).getReg();
+  Register TmpMaskReg = MI.getOperand(2).getReg();
+  Register Src1Reg = MI.getOperand(3).getReg();
+  Register Src2Reg = MI.getOperand(4).getReg();
+  Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+
+  // Determine instruction opcodes based on register width
+  unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+  if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
+    MovZXOp = 0; // No zero-extend needed for GR8
+    NegOp = X86::NEG8r;
+    MovOp = X86::MOV8rr;
+    AndOp = X86::AND8rr;
+    NotOp = X86::NOT8r;
+    OrOp = X86::OR8rr;
+  } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) {
+    MovZXOp = X86::MOVZX16rr8;
+    NegOp = X86::NEG16r;
+    MovOp = X86::MOV16rr;
+    AndOp = X86::AND16rr;
+    NotOp = X86::NOT16r;
+    OrOp = X86::OR16rr;
+  } else { // X86::CTSELECT_I386_INT_GR32rr
+    MovZXOp = X86::MOVZX32rr8;
+    NegOp = X86::NEG32r;
+    MovOp = X86::MOV32rr;
+    AndOp = X86::AND32rr;
+    NotOp = X86::NOT32r;
+    OrOp = X86::OR32rr;
+  }
+
+  // Constant-time selection bundle (8 instructions, 7 for GR8; no SETCC
+  // inside): result = (true_val & mask) | (false_val & ~mask)
+  // The condition byte is already materialized, avoiding any EFLAGS
+  // dependency.
+
+  // Step 1: Copy the pre-materialized condition byte to TmpByteReg
+  // This allows the bundle to work with allocated temporaries
+  auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+                .addReg(CondByteReg)
+                .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  auto BundleStart = I1->getIterator();
+
+  // Step 2: Zero-extend the condition byte to register width (0 or 1)
+  if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) {
+    BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+        .addReg(TmpByteReg)
+        .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  }
+
+  // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+  Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ?
TmpByteReg : TmpMaskReg; + BuildMI(*MBB, MI, DL, get(NegOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask + BuildMI(*MBB, MI, DL, get(MovOp), DstReg) + .addReg(Src1Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 6: Create inverted mask inline (~mask) + BuildMI(*MBB, MI, DL, get(NotOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 7: Apply inverted mask to false value - reuse mask register directly + BuildMI(*MBB, MI, DL, get(AndOp), MaskReg) + .addReg(MaskReg) + .addReg(Src2Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 8: Final result: (src1 & mask) | (src2 & ~mask) + auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Bundle all generated instructions for atomic execution before removing MI + auto BundleEnd = std::next(LI->getIterator()); + if (BundleStart != BundleEnd) { + // Only bundle if we have multiple instructions + finalizeBundle(*MBB, BundleStart, BundleEnd); + } + + // TODO: Optimization opportunity - The register allocator may choose callee-saved + // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary + // save/restore overhead. Consider constraining these to caller-saved register + // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve + // constant-time performance by eliminating prologue/epilogue instructions. + + // Remove the original pseudo instruction + MI.eraseFromParent(); + return true; +} + static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) { switch (Opcode) { default: @@ -6411,6 +6961,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; + + case X86::CTSELECT64rr: + case X86::CTSELECT32rr: + case X86::CTSELECT16rr: + case X86::CTSELECT64rm: + case X86::CTSELECT32rm: + case X86::CTSELECT16rm: + // These CTSELECT pseudos are only selected when CMOV is available + // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available + return expandCtSelectWithCMOV(MI); + + // non-cmov CTSELECT expansion (post-RA, constant-time) + // These are the internal pseudos with pre-materialized condition byte + case X86::CTSELECT_I386_INT_GR8rr: + case X86::CTSELECT_I386_INT_GR16rr: + case X86::CTSELECT_I386_INT_GR32rr: + return expandCtSelectIntWithoutCMOV(MI); + + case X86::CTSELECT_V2F64: + case X86::CTSELECT_V4F32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V4I64: + case X86::CTSELECT_V8I32: + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V8F32: + return expandCtSelectVector(MI); } return false; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..ebd7e070d5fe8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -724,6 +724,12 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isFrameOperand(const MachineInstr &MI, 
unsigned int Op, int &FrameIndex) const;
+  /// Expand the CTSELECT pseudo-instructions.
+  bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+  bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+  bool expandCtSelectVector(MachineInstr &MI) const;
+
   /// Returns true iff the routine could find two commutable operands in the
   /// given machine instruction with 3 vector inputs.
   /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index c20bb05018b4d..23841034ed411 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,6 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">;
 def HasCF : Predicate<"Subtarget->hasCF()">;
 def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for the native CMOV instruction (checks hasCMOV(), not canUseCMOV()).
+// HasCMOV may be true even without native CMOV (e.g. via SSE-based emulation), so
+// constant-time code that requires an actual CMOV must use HasNativeCMOV/NoNativeCMOV.
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
 def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
 def HasMMX : Predicate<"Subtarget->hasMMX()">;
 def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..a776b54912c16 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -617,10 +617,10 @@ void X86PassConfig::addPreEmitPass2() {
         // ObjC runtime functions present in the module.
         const Function &F = MF.getFunction();
         const Module *M = F.getParent();
-        return M->getModuleFlag("kcfi") ||
+        return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
                (TT.isOSDarwin() &&
                 (M->getFunction("objc_retainAutoreleasedReturnValue") ||
                  M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
       }));
 
   // Analyzes and emits pseudos to support Win x64 Unwind V2.
This pass must run diff --git a/llvm/test/CodeGen/AArch64/ctselect.ll b/llvm/test/CodeGen/AArch64/ctselect.ll new file mode 100644 index 0000000000000..4cde9fe8a866a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ctselect.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=DEFAULT,NOFP16 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=DEFAULT,FP16 + +define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { +; DEFAULT-LABEL: ct_i1: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %1 +} + +define i8 @ct_i8(i1 %cond, i8 %a, i8 %b) { +; DEFAULT-LABEL: ct_i8: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %1 +} + +define i16 @ct_i16(i1 %cond, i16 %a, i16 %b) { +; DEFAULT-LABEL: ct_i16: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %1 +} + +define i32 @ct_i32(i1 %cond, i32 %a, i32 %b) { +; DEFAULT-LABEL: ct_i32: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %1 +} + +define i64 @ct_i64(i1 %cond, i64 %a, i64 %b) { +; DEFAULT-LABEL: ct_i64: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %1 +} + +define i128 @ct_i128(i1 %cond, i128 %a, i128 %b) { +; DEFAULT-LABEL: ct_i128: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %1 +} + +define half @ct_f16(i1 %cond, half %a, half %b) { +; DEFAULT-LABEL: ct_f16: +; NOFP16: fcvt +; NOFP16: csel +; FP16: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} +; NOFP16: fcvt + %1 = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) + ret half %1 +} + +define float @ct_f32(i1 %cond, float %a, float %b) { +; DEFAULT-LABEL: ct_f32: +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %1 +} + +define double @ct_f64(i1 %cond, double %a, double %b) { +; DEFAULT-LABEL: ct_f64: +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %1 +} + +define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; DEFAULT-LABEL: ct_v4i32: +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{ldr}} + %1 = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %1 +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; DEFAULT-LABEL: ct_v4f32: +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{ldr}} + %1 = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %1 +} \ No 
newline at end of file diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll new file mode 100644 index 0000000000000..f75707fc91af3 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect-half.ll @@ -0,0 +1,975 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv8.6a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=BFLOAT-F16-NATIVE %s +; RUN: llc < %s -mtriple=armv8.2a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=F16-NATIVE %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s + +define half @ct_half(i1 %cond, half %a, half %b) { +; CT-LABEL: ct_half: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; BFLOAT-F16-NATIVE-LABEL: ct_half: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: and r3, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: rsb r12, r3, #0 +; BFLOAT-F16-NATIVE-NEXT: and r0, r1, r12 +; BFLOAT-F16-NATIVE-NEXT: bic r12, r2, r12 +; BFLOAT-F16-NATIVE-NEXT: orr r0, r0, r12 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_half: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: and r3, r0, #1 +; F16-NATIVE-NEXT: rsb r12, r3, #0 +; F16-NATIVE-NEXT: and r0, r1, r12 +; F16-NATIVE-NEXT: bic r12, r2, r12 +; F16-NATIVE-NEXT: orr r0, r0, r12 +; F16-NATIVE-NEXT: bx lr +; +; THUMB1-LABEL: ct_half: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_half: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) + ret half %sel +} + +define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) { +; CT-LABEL: ct_bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; BFLOAT-F16-NATIVE-LABEL: ct_bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .pad #4 +; BFLOAT-F16-NATIVE-NEXT: sub sp, sp, #4 +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: rsb r12, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: and r3, r1, r12 +; BFLOAT-F16-NATIVE-NEXT: bic r12, r2, r12 +; BFLOAT-F16-NATIVE-NEXT: orr r3, r3, r12 +; BFLOAT-F16-NATIVE-NEXT: strh r3, [sp, #2] +; BFLOAT-F16-NATIVE-NEXT: ldrh r0, [sp, #2] +; BFLOAT-F16-NATIVE-NEXT: add sp, sp, #4 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: and r3, r0, #1 +; F16-NATIVE-NEXT: rsb r12, r3, #0 +; F16-NATIVE-NEXT: and r0, r1, r12 +; F16-NATIVE-NEXT: bic r12, r2, r12 +; F16-NATIVE-NEXT: orr r0, r0, r12 +; 
F16-NATIVE-NEXT: bx lr +; +; THUMB1-LABEL: ct_bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b) + ret bfloat %sel +} + +define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) { +; CT-LABEL: ct_v4f16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, lr} +; CT-NEXT: push {r4, r5, r6, lr} +; CT-NEXT: ldrh r1, [sp, #20] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r4, [sp, #16] +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: ldrh lr, [sp, #28] +; CT-NEXT: orr r1, r4, r1, lsl #16 +; CT-NEXT: ldrh r6, [sp, #24] +; CT-NEXT: ldrh r5, [sp, #32] +; CT-NEXT: vmov d17, r2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: orr r6, r6, lr, lsl #16 +; CT-NEXT: orr r3, r5, r12, lsl #16 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vmov d16, r6, r3 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov.u16 r0, d18[0] +; CT-NEXT: vmov.u16 r1, d18[1] +; CT-NEXT: vmov.u16 r2, d18[2] +; CT-NEXT: vmov.u16 r3, d18[3] +; CT-NEXT: pop {r4, r5, r6, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v4f16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; BFLOAT-F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; BFLOAT-F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; BFLOAT-F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; BFLOAT-F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; BFLOAT-F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; BFLOAT-F16-NATIVE-NEXT: vmov d17, r2, r1 +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 d19, r1 +; BFLOAT-F16-NATIVE-NEXT: vmov d16, r6, r3 +; BFLOAT-F16-NATIVE-NEXT: vand d18, d17, d19 +; BFLOAT-F16-NATIVE-NEXT: vbic d19, d16, d19 +; BFLOAT-F16-NATIVE-NEXT: vorr d18, d18, d19 +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; BFLOAT-F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; F16-NATIVE-LABEL: ct_v4f16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; F16-NATIVE-NEXT: and r0, r0, #1 +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; F16-NATIVE-NEXT: vmov d17, r2, r1 +; F16-NATIVE-NEXT: rsb r1, r0, #0 +; 
F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; F16-NATIVE-NEXT: vdup.32 d19, r1 +; F16-NATIVE-NEXT: vmov d16, r6, r3 +; F16-NATIVE-NEXT: vand d18, d17, d19 +; F16-NATIVE-NEXT: vbic d19, d16, d19 +; F16-NATIVE-NEXT: vorr d18, d18, d19 +; F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; THUMB1-LABEL: ct_v4f16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4f16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) + ret <4 x half> %sel +} + +define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { +; CT-LABEL: ct_v4bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, lr} +; CT-NEXT: push {r4, r5, r6, lr} +; CT-NEXT: ldrh r1, [sp, #20] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r4, [sp, #16] +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: ldrh lr, [sp, #28] +; CT-NEXT: orr r1, r4, r1, lsl #16 +; CT-NEXT: ldrh r6, [sp, #24] +; CT-NEXT: ldrh r5, [sp, #32] +; CT-NEXT: vmov d17, r2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: orr r6, r6, lr, lsl #16 +; CT-NEXT: orr r3, r5, r12, lsl #16 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vmov d16, r6, 
r3 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov.u16 r0, d18[0] +; CT-NEXT: vmov.u16 r1, d18[1] +; CT-NEXT: vmov.u16 r2, d18[2] +; CT-NEXT: vmov.u16 r3, d18[3] +; CT-NEXT: pop {r4, r5, r6, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v4bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: vldr d16, [sp] +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: vmov d17, r2, r3 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 d19, r1 +; BFLOAT-F16-NATIVE-NEXT: vand d18, d17, d19 +; BFLOAT-F16-NATIVE-NEXT: vbic d19, d16, d19 +; BFLOAT-F16-NATIVE-NEXT: vorr d18, d18, d19 +; BFLOAT-F16-NATIVE-NEXT: vmov r0, r1, d18 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_v4bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; F16-NATIVE-NEXT: and r0, r0, #1 +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; F16-NATIVE-NEXT: vmov d17, r2, r1 +; F16-NATIVE-NEXT: rsb r1, r0, #0 +; F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; F16-NATIVE-NEXT: vdup.32 d19, r1 +; F16-NATIVE-NEXT: vmov d16, r6, r3 +; F16-NATIVE-NEXT: vand d18, d17, d19 +; F16-NATIVE-NEXT: vbic d19, d16, d19 +; F16-NATIVE-NEXT: vorr d18, d18, d19 +; F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; THUMB1-LABEL: ct_v4bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; 
THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %sel +} + +define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) { +; CT-LABEL: ct_v8f16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CT-NEXT: push {r4, r5, r6, r7, r8, lr} +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #32] +; CT-NEXT: and r1, r1, #1 +; CT-NEXT: ldrh r3, [sp, #52] +; CT-NEXT: vmov.32 d16[0], r2 +; CT-NEXT: ldrh r2, [sp, #48] +; CT-NEXT: orr r7, r7, r12, lsl #16 +; CT-NEXT: ldrh r5, [sp, #68] +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: vmov.32 d17[0], r7 +; CT-NEXT: ldrh r7, [sp, #64] +; CT-NEXT: ldrh r3, [sp, #28] +; CT-NEXT: vmov.32 d18[0], r2 +; CT-NEXT: ldrh r2, [sp, #24] +; CT-NEXT: orr r7, r7, r5, lsl #16 +; CT-NEXT: ldrh r5, [sp, #76] +; CT-NEXT: vmov.32 d19[0], r7 +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #72] +; CT-NEXT: ldrh lr, [sp, #60] +; CT-NEXT: vmov.32 d16[1], r2 +; CT-NEXT: orr r2, r7, r5, lsl #16 +; CT-NEXT: ldrh r4, [sp, #56] +; CT-NEXT: ldrh r8, [sp, #44] +; CT-NEXT: vmov.32 d19[1], r2 +; CT-NEXT: orr r2, r4, lr, lsl #16 +; CT-NEXT: ldrh r6, [sp, #40] +; CT-NEXT: vmov.32 d18[1], r2 +; CT-NEXT: orr r2, r6, r8, lsl #16 +; CT-NEXT: vmov.32 d17[1], r2 +; CT-NEXT: rsb r2, r1, #0 +; CT-NEXT: vdup.32 q11, r2 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vst1.64 {d20, d21}, [r0:128] +; CT-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v8f16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; BFLOAT-F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; BFLOAT-F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; BFLOAT-F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; BFLOAT-F16-NATIVE-NEXT: and r1, r1, #1 +; BFLOAT-F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; BFLOAT-F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; BFLOAT-F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; BFLOAT-F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; BFLOAT-F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; BFLOAT-F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; BFLOAT-F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; BFLOAT-F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; BFLOAT-F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d19[1], 
r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; BFLOAT-F16-NATIVE-NEXT: rsb r2, r1, #0 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 q11, r2 +; BFLOAT-F16-NATIVE-NEXT: vand q10, q8, q11 +; BFLOAT-F16-NATIVE-NEXT: vbic q11, q9, q11 +; BFLOAT-F16-NATIVE-NEXT: vorr q10, q10, q11 +; BFLOAT-F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; BFLOAT-F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; F16-NATIVE-LABEL: ct_v8f16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; F16-NATIVE-NEXT: and r1, r1, #1 +; F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; F16-NATIVE-NEXT: rsb r2, r1, #0 +; F16-NATIVE-NEXT: vdup.32 q11, r2 +; F16-NATIVE-NEXT: vand q10, q8, q11 +; F16-NATIVE-NEXT: vbic q11, q9, q11 +; F16-NATIVE-NEXT: vorr q10, q10, q11 +; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; THUMB1-LABEL: ct_v8f16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; 
THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8f16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) + ret <8 x 
half> %sel +} + +define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { +; CT-LABEL: ct_v8bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CT-NEXT: push {r4, r5, r6, r7, r8, lr} +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #32] +; CT-NEXT: and r1, r1, #1 +; CT-NEXT: ldrh r3, [sp, #52] +; CT-NEXT: vmov.32 d16[0], r2 +; CT-NEXT: ldrh r2, [sp, #48] +; CT-NEXT: orr r7, r7, r12, lsl #16 +; CT-NEXT: ldrh r5, [sp, #68] +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: vmov.32 d17[0], r7 +; CT-NEXT: ldrh r7, [sp, #64] +; CT-NEXT: ldrh r3, [sp, #28] +; CT-NEXT: vmov.32 d18[0], r2 +; CT-NEXT: ldrh r2, [sp, #24] +; CT-NEXT: orr r7, r7, r5, lsl #16 +; CT-NEXT: ldrh r5, [sp, #76] +; CT-NEXT: vmov.32 d19[0], r7 +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #72] +; CT-NEXT: ldrh lr, [sp, #60] +; CT-NEXT: vmov.32 d16[1], r2 +; CT-NEXT: orr r2, r7, r5, lsl #16 +; CT-NEXT: ldrh r4, [sp, #56] +; CT-NEXT: ldrh r8, [sp, #44] +; CT-NEXT: vmov.32 d19[1], r2 +; CT-NEXT: orr r2, r4, lr, lsl #16 +; CT-NEXT: ldrh r6, [sp, #40] +; CT-NEXT: vmov.32 d18[1], r2 +; CT-NEXT: orr r2, r6, r8, lsl #16 +; CT-NEXT: vmov.32 d17[1], r2 +; CT-NEXT: rsb r2, r1, #0 +; CT-NEXT: vdup.32 q11, r2 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vst1.64 {d20, d21}, [r0:128] +; CT-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v8bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: add r1, sp, #8 +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: vld1.64 {d18, d19}, [r1] +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: vldr d17, [sp] +; BFLOAT-F16-NATIVE-NEXT: vmov d16, r2, r3 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 q11, r1 +; BFLOAT-F16-NATIVE-NEXT: vand q10, q8, q11 +; BFLOAT-F16-NATIVE-NEXT: vbic q11, q9, q11 +; BFLOAT-F16-NATIVE-NEXT: vorr q10, q10, q11 +; BFLOAT-F16-NATIVE-NEXT: vmov r0, r1, d20 +; BFLOAT-F16-NATIVE-NEXT: vmov r2, r3, d21 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_v8bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; F16-NATIVE-NEXT: and r1, r1, #1 +; F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; F16-NATIVE-NEXT: rsb r2, r1, 
#0 +; F16-NATIVE-NEXT: vdup.32 q11, r2 +; F16-NATIVE-NEXT: vand q10, q8, q11 +; F16-NATIVE-NEXT: vbic q11, q9, q11 +; F16-NATIVE-NEXT: vorr q10, q10, q11 +; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; THUMB1-LABEL: ct_v8bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w 
r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %sel +} diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll new file mode 100644 index 0000000000000..c410f78b24c0e --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect-vector.ll @@ -0,0 +1,2179 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s + +define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { +; CT-LABEL: ct_v8i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v8i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrb r12, [sp, #68] +; DEFAULT-NEXT: ldrb r1, [sp, #36] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrb r12, [sp, #64] +; DEFAULT-NEXT: ldrb r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #7] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #60] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #56] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #24] +; 
DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #5] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #52] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #4] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #48] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #3] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r1, [sp, #44] +; DEFAULT-NEXT: strb r4, [r0, #2] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrb r1, [sp, #40] +; DEFAULT-NEXT: strb r5, [r0, #1] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strb r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v8i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #7] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #5] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #3] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strb r5, [r0, #1] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 
+; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strb r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrb.w r12, [sp, #68] +; THUMB2-NEXT: ldrb.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrb.w r12, [sp, #64] +; THUMB2-NEXT: ldrb.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #7] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #5] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #4] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #3] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r1, [sp, #44] +; THUMB2-NEXT: strb r4, [r0, #2] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrb.w r1, [sp, #40] +; THUMB2-NEXT: strb r5, [r0, #1] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strb r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) + ret <8 x i8> %sel +} + +define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { +; CT-LABEL: ct_v4i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldrh r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldrh r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldrh r3, [sp, #16] +; DEFAULT-NEXT: ldrh lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldrh lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldrh r4, [sp, #20] +; DEFAULT-NEXT: 
and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) + ret <4 x i16> %sel +} + +define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { +; CT-LABEL: ct_v2i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: 
movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %sel +} + +define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) { +; CT-LABEL: ct_v1i64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v1i64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v1i64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) + ret <1 x i64> %sel +} + +define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { +; CT-LABEL: ct_v2f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; 
CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) + ret <2 x float> %sel +} + +define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; CT-LABEL: ct_v16i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v16i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrb r12, [sp, #132] +; DEFAULT-NEXT: ldrb r1, [sp, #68] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrb r12, [sp, #128] +; DEFAULT-NEXT: ldrb r5, [sp, #64] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #15] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #124] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #60] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #14] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #120] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #56] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #13] +; DEFAULT-NEXT: and r4, r5, r1 +; 
DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #116] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #52] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #12] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #112] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #48] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #11] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #108] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #44] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #10] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #104] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #40] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #9] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #100] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #36] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #8] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #96] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #7] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #92] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #88] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #5] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #84] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #4] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #80] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #3] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r1, [sp, #76] +; DEFAULT-NEXT: strb r4, [r0, #2] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrb r1, [sp, #72] +; DEFAULT-NEXT: strb r5, [r0, #1] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strb r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v16i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #140] +; THUMB1-NEXT: ldr r5, [sp, #76] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #15] +; THUMB1-NEXT: ldr r1, [sp, #136] +; THUMB1-NEXT: ldr r5, [sp, #72] +; THUMB1-NEXT: 
mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #132] +; THUMB1-NEXT: ldr r5, [sp, #68] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #13] +; THUMB1-NEXT: ldr r1, [sp, #128] +; THUMB1-NEXT: ldr r5, [sp, #64] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #124] +; THUMB1-NEXT: ldr r5, [sp, #60] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #11] +; THUMB1-NEXT: ldr r1, [sp, #120] +; THUMB1-NEXT: ldr r5, [sp, #56] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #116] +; THUMB1-NEXT: ldr r5, [sp, #52] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #9] +; THUMB1-NEXT: ldr r1, [sp, #112] +; THUMB1-NEXT: ldr r5, [sp, #48] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #108] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #7] +; THUMB1-NEXT: ldr r1, [sp, #104] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #100] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #5] +; THUMB1-NEXT: ldr r1, [sp, #96] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #92] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; 
THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #3] +; THUMB1-NEXT: ldr r1, [sp, #88] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #84] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strb r5, [r0, #1] +; THUMB1-NEXT: ldr r1, [sp, #80] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strb r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v16i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrb.w r12, [sp, #132] +; THUMB2-NEXT: ldrb.w r1, [sp, #68] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrb.w r12, [sp, #128] +; THUMB2-NEXT: ldrb.w r5, [sp, #64] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #15] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #124] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #60] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #120] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #56] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #13] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #116] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #52] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #112] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #48] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #11] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #108] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #44] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #104] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #40] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #9] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #100] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #36] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #96] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #7] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #92] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w 
r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #88] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #5] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #84] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #4] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #80] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #3] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r1, [sp, #76] +; THUMB2-NEXT: strb r4, [r0, #2] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrb.w r1, [sp, #72] +; THUMB2-NEXT: strb r5, [r0, #1] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strb r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %sel +} + +define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; CT-LABEL: ct_v8i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v8i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrh r12, [sp, #68] +; DEFAULT-NEXT: ldrh r1, [sp, #36] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrh r12, [sp, #64] +; DEFAULT-NEXT: ldrh r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #14] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #60] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #12] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #56] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #10] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #52] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #8] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #48] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r1, [sp, #44] +; DEFAULT-NEXT: strh r4, [r0, #4] +; 
DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrh r1, [sp, #40] +; DEFAULT-NEXT: strh r5, [r0, #2] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strh r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v8i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: 
rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %sel +} + +define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; CT-LABEL: ct_v4i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; 
THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %sel +} + +define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; CT-LABEL: ct_v2i64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v2i64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr 
r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v2i64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %sel +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; CT-LABEL: ct_v4f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4f32: +; THUMB1: @ 
%bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %sel +} + +define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; CT-LABEL: ct_v2f64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2f64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; 
DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v2f64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v2f64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %sel +} + +; +; itty bitty vector type edge cases follow. these should be scalarised. 
+; +define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) { +; CT-LABEL: ct_v1i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x i8> @llvm.ct.select.i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) + ret <1 x i8> %sel +} + +define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) { +; CT-LABEL: ct_v2i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r1, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: ldrb r3, [sp, #8] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ldr r3, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r1, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: ldrb.w r3, [sp, #8] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x i8> @llvm.ct.select.i16(i1 %cond, <2 x i8> %a, <2 x i8> %b) + ret <2 x i8> %sel +} + +define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) { +; CT-LABEL: ct_v4i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] 
+; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldrb lr, [sp, #20] +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r0, r1, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: orr r0, r0, r4 +; DEFAULT-NEXT: ldrb r4, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r4, lr +; DEFAULT-NEXT: ldrb r4, [sp, #28] +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r2, r3, lr +; DEFAULT-NEXT: bic lr, r4, lr +; DEFAULT-NEXT: orr r2, r2, lr +; DEFAULT-NEXT: ldrb r4, [sp, #16] +; DEFAULT-NEXT: ldrb lr, [sp, #32] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r5 +; THUMB1-NEXT: ands r0, r6 +; THUMB1-NEXT: eors r0, r5 +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r5 +; THUMB1-NEXT: ands r1, r6 +; THUMB1-NEXT: eors r1, r5 +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r3 +; THUMB1-NEXT: eors r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: ldr r6, [sp, #20] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrb.w lr, [sp, #20] +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r0, r1, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: orrs r0, r4 +; THUMB2-NEXT: ldrb.w r4, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r4, lr +; THUMB2-NEXT: ldrb.w r4, [sp, #28] +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r2, r3, lr +; THUMB2-NEXT: bic.w lr, r4, lr +; THUMB2-NEXT: orr.w r2, r2, lr +; THUMB2-NEXT: ldrb.w r4, [sp, #16] +; THUMB2-NEXT: ldrb.w lr, [sp, #32] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x i8> @llvm.ct.select.i32(i1 %cond, <4 x i8> %a, <4 x i8> %b) + ret <4 x i8> %sel +} + +define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) { +; CT-LABEL: ct_v1i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: 
bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) + ret <1 x i16> %sel +} + +define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) { +; CT-LABEL: ct_v2i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r1, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: ldrh r3, [sp, #8] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ldr r3, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r1, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #8] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b) + ret <2 x i16> %sel +} + +define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) { +; CT-LABEL: ct_v1i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: 
and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %sel +} + +define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) { +; CT-LABEL: ct_v1f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: vmov s0, r2 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vmov s2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov r3, s0 +; CT-NEXT: vmov r2, s2 +; CT-NEXT: and r2, r2, r1 +; CT-NEXT: bic r1, r3, r1 +; CT-NEXT: orr r2, r2, r1 +; CT-NEXT: vmov s4, r2 +; CT-NEXT: vmov r0, s4 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b) + ret <1 x float> %sel +} diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll new file mode 100644 index 0000000000000..40e17cb135627 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect.ll @@ -0,0 +1,555 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s +; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEXA9 %s +; RUN: llc < %s -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEX-NOTHUMB %s + +define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { +; CT-LABEL: ct_i1: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic 
r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_i1: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_i1: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_i1: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_i1: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_i1: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %sel +} + +define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) { +; CT-LABEL: ct_int8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_int8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_int8: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int8: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %sel +} + +define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) { +; CT-LABEL: ct_int16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: 
rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_int16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_int16: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int16: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %sel +} + +define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) { +; CT-LABEL: ct_int32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_int32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_int32: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int32: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %sel +} + +define i64 @ct_int64(i1 %cond, i64 %a, 
i64 %b) { +; CT-LABEL: ct_int64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, lr} +; CT-NEXT: push {r4, lr} +; CT-NEXT: and lr, r0, #1 +; CT-NEXT: ldr r12, [sp, #12] +; CT-NEXT: rsb r4, lr, #0 +; CT-NEXT: ldr r1, [sp, #8] +; CT-NEXT: and r0, r2, r4 +; CT-NEXT: rsb r2, lr, #0 +; CT-NEXT: bic r4, r1, r4 +; CT-NEXT: and r1, r3, r2 +; CT-NEXT: bic r2, r12, r2 +; CT-NEXT: orr r0, r0, r4 +; CT-NEXT: orr r1, r1, r2 +; CT-NEXT: pop {r4, pc} +; +; DEFAULT-LABEL: ct_int64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_int64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_int64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; +; CORTEXA9-LABEL: ct_int64: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: .save {r4, lr} +; CORTEXA9-NEXT: push {r4, lr} +; CORTEXA9-NEXT: and lr, r0, #1 +; CORTEXA9-NEXT: ldrd r1, r12, [sp, #8] +; CORTEXA9-NEXT: rsb.w r4, lr, #0 +; CORTEXA9-NEXT: and.w r0, r2, r4 +; CORTEXA9-NEXT: rsb.w r2, lr, #0 +; CORTEXA9-NEXT: bic.w r4, r1, r4 +; CORTEXA9-NEXT: and.w r1, r3, r2 +; CORTEXA9-NEXT: bic.w r2, r12, r2 +; CORTEXA9-NEXT: orrs r0, r4 +; CORTEXA9-NEXT: orr.w r1, r1, r2 +; CORTEXA9-NEXT: pop {r4, pc} +; +; CORTEX-NOTHUMB-LABEL: ct_int64: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: .save {r4, lr} +; CORTEX-NOTHUMB-NEXT: push {r4, lr} +; CORTEX-NOTHUMB-NEXT: and lr, r0, #1 +; CORTEX-NOTHUMB-NEXT: ldr r12, [sp, #12] +; CORTEX-NOTHUMB-NEXT: ldr r1, [sp, #8] +; CORTEX-NOTHUMB-NEXT: rsb r4, lr, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r2, r4 +; CORTEX-NOTHUMB-NEXT: rsb r2, lr, #0 +; CORTEX-NOTHUMB-NEXT: bic r4, r1, r4 +; CORTEX-NOTHUMB-NEXT: and r1, r3, r2 +; CORTEX-NOTHUMB-NEXT: bic r2, r12, r2 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r4 +; CORTEX-NOTHUMB-NEXT: orr r1, r1, r2 +; CORTEX-NOTHUMB-NEXT: pop {r4, pc} +entry: + %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %sel +} + +define float @ct_float(i1 %cond, float %a, float %b) { +; CT-LABEL: ct_float: +; CT: @ %bb.0: @ %entry +; CT-NEXT: vmov s0, r2 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vmov s2, r1 +; 
CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov r3, s0 +; CT-NEXT: vmov r2, s2 +; CT-NEXT: and r2, r2, r1 +; CT-NEXT: bic r1, r3, r1 +; CT-NEXT: orr r2, r2, r1 +; CT-NEXT: vmov s4, r2 +; CT-NEXT: vmov r0, s4 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_float: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_float: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_float: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_float: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r0, r0, #1 +; CORTEXA9-NEXT: vmov r2, s0 +; CORTEXA9-NEXT: vmov r3, s1 +; CORTEXA9-NEXT: rsbs r1, r0, #0 +; CORTEXA9-NEXT: ands r2, r1 +; CORTEXA9-NEXT: bic.w r1, r3, r1 +; CORTEXA9-NEXT: orrs r2, r1 +; CORTEXA9-NEXT: vmov s2, r2 +; CORTEXA9-NEXT: vmov.f32 s0, s2 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_float: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r0, r0, #1 +; CORTEX-NOTHUMB-NEXT: vmov r2, s0 +; CORTEX-NOTHUMB-NEXT: vmov r3, s1 +; CORTEX-NOTHUMB-NEXT: rsb r1, r0, #0 +; CORTEX-NOTHUMB-NEXT: and r2, r2, r1 +; CORTEX-NOTHUMB-NEXT: bic r1, r3, r1 +; CORTEX-NOTHUMB-NEXT: orr r2, r2, r1 +; CORTEX-NOTHUMB-NEXT: vmov s2, r2 +; CORTEX-NOTHUMB-NEXT: vmov.f32 s0, s2 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %sel +} + +define double @ct_f64(i1 %cond, double %a, double %b) { +; CT-LABEL: ct_f64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_f64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_f64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; 
THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_f64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; +; CORTEXA9-LABEL: ct_f64: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r0, r0, #1 +; CORTEXA9-NEXT: rsbs r1, r0, #0 +; CORTEXA9-NEXT: vdup.32 d17, r1 +; CORTEXA9-NEXT: vand d16, d0, d17 +; CORTEXA9-NEXT: vbic d17, d1, d17 +; CORTEXA9-NEXT: vorr d16, d16, d17 +; CORTEXA9-NEXT: vorr d0, d16, d16 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_f64: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r0, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r1, r0, #0 +; CORTEX-NOTHUMB-NEXT: vdup.32 d17, r1 +; CORTEX-NOTHUMB-NEXT: vand d16, d0, d17 +; CORTEX-NOTHUMB-NEXT: vbic d17, d1, d17 +; CORTEX-NOTHUMB-NEXT: vorr d16, d16, d17 +; CORTEX-NOTHUMB-NEXT: vorr d0, d16, d16 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %sel +} diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..42f460f2c598f --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll @@ -0,0 +1,451 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Portable edge case tests + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; M32-LABEL: test_ctselect_i1: +; M32: # %bb.0: +; M32-NEXT: xori $2, $4, 1 +; M32-NEXT: and $1, $4, $5 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i1: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: xori $2, $2, 1 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: and $2, $4, $5 +; M64-NEXT: sll $2, $2, 0 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; M32-LABEL: test_ctselect_extremal_values: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lui $3, 32768 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lui $3, 32767 +; M32-NEXT: ori $3, $3, 65535 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_extremal_values: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: lui $3, 32768 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: addiu $2, $1, -1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: lui $3, 32767 +; M64-NEXT: ori $3, $3, 65535 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 
%cond, ptr %ptr) { +; M32-LABEL: test_ctselect_null_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $5 +; +; M64-LABEL: test_ctselect_null_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $1, $5 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; M32-LABEL: test_ctselect_function_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_function_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_ptr_cmp: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_ptr_cmp: +; M64: # %bb.0: +; M64-NEXT: xor $1, $4, $5 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: daddiu $2, $zero, -1 +; M64-NEXT: movn $3, $zero, $1 +; M64-NEXT: xor $2, $3, $2 +; M64-NEXT: and $1, $3, $6 +; M64-NEXT: and $2, $2, $7 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_struct_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_struct_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; M32-LABEL: test_ctselect_deeply_nested: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 20($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: andi $3, $7, 1 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $6, 1 +; M32-NEXT: lw $6, 32($sp) +; M32-NEXT: negu $4, $3 +; M32-NEXT: addiu $3, $3, -1 +; M32-NEXT: negu $5, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $5, $1 +; M32-NEXT: lw $5, 28($sp) +; 
M32-NEXT: and $2, $2, $5 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: and $2, $3, $6 +; M32-NEXT: and $1, $4, $1 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_deeply_nested: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $4, $9, 0 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: sll $8, $11, 0 +; M64-NEXT: lw $9, 0($sp) +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $1, $1, $4 +; M64-NEXT: sll $4, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: sll $5, $7, 0 +; M64-NEXT: andi $4, $4, 1 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: andi $5, $5, 1 +; M64-NEXT: negu $2, $4 +; M64-NEXT: addiu $4, $4, -1 +; M64-NEXT: negu $7, $3 +; M64-NEXT: negu $6, $5 +; M64-NEXT: addiu $5, $5, -1 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: sll $2, $10, 0 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: addiu $2, $3, -1 +; M64-NEXT: and $1, $7, $1 +; M64-NEXT: and $2, $2, $8 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: and $2, $5, $9 +; M64-NEXT: and $1, $6, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + + ; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; M32-LABEL: cmovznz4_fstar_original: +; M32: # %bb.0: # %entry +; M32-NEXT: or $1, $4, $5 +; M32-NEXT: addiu $2, $7, 16 +; M32-NEXT: addiu $3, $6, 16 +; M32-NEXT: addiu $4, $6, 8 +; M32-NEXT: movz $2, $3, $1 +; M32-NEXT: addiu $3, $7, 8 +; M32-NEXT: movz $3, $4, $1 +; M32-NEXT: addiu $4, $7, 24 +; M32-NEXT: movz $7, $6, $1 +; M32-NEXT: addiu $6, $6, 24 +; M32-NEXT: lw $9, 4($2) +; M32-NEXT: lw $2, 0($2) +; M32-NEXT: movz $4, $6, $1 +; M32-NEXT: lw $5, 4($7) +; M32-NEXT: lw $8, 4($3) +; M32-NEXT: lw $7, 0($7) +; M32-NEXT: lw $3, 0($3) +; M32-NEXT: lw $6, 16($sp) +; M32-NEXT: lw $1, 4($4) +; M32-NEXT: lw $4, 0($4) +; M32-NEXT: sw $4, 24($6) +; M32-NEXT: sw $1, 28($6) +; M32-NEXT: sw $2, 16($6) +; M32-NEXT: sw $9, 20($6) +; M32-NEXT: sw $3, 8($6) +; M32-NEXT: sw $8, 12($6) +; M32-NEXT: sw $7, 0($6) +; M32-NEXT: jr $ra +; M32-NEXT: sw $5, 4($6) +; +; M64-LABEL: cmovznz4_fstar_original: +; M64: # %bb.0: # %entry +; M64-NEXT: daddiu $1, $6, 8 +; M64-NEXT: daddiu $2, $5, 8 +; M64-NEXT: daddiu $3, $6, 16 +; M64-NEXT: daddiu $8, $5, 16 +; M64-NEXT: movz $1, $2, $4 +; M64-NEXT: move $2, $6 +; M64-NEXT: daddiu $6, $6, 24 +; M64-NEXT: movz $3, $8, $4 +; M64-NEXT: movz $2, $5, $4 +; M64-NEXT: daddiu $5, $5, 24 +; M64-NEXT: ld $1, 0($1) +; M64-NEXT: ld $3, 0($3) +; M64-NEXT: movz $6, $5, $4 +; M64-NEXT: ld $2, 0($2) +; M64-NEXT: ld $4, 0($6) +; M64-NEXT: sd $4, 24($7) +; M64-NEXT: sd $3, 16($7) +; M64-NEXT: sd $1, 8($7) +; M64-NEXT: jr $ra +; M64-NEXT: sd $2, 0($7) +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, 
i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; M32-LABEL: cmovznz4_builtin_ctselect: +; M32: # %bb.0: # %entry +; M32-NEXT: or $1, $4, $5 +; M32-NEXT: lw $3, 4($7) +; M32-NEXT: lw $4, 4($6) +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $4, $1, $4 +; M32-NEXT: or $3, $4, $3 +; M32-NEXT: lw $4, 16($sp) +; M32-NEXT: sw $3, 4($4) +; M32-NEXT: lw $3, 0($7) +; M32-NEXT: lw $5, 0($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 0($4) +; M32-NEXT: lw $3, 12($7) +; M32-NEXT: lw $5, 12($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 12($4) +; M32-NEXT: lw $3, 8($7) +; M32-NEXT: lw $5, 8($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 8($4) +; M32-NEXT: lw $3, 20($7) +; M32-NEXT: lw $5, 20($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 20($4) +; M32-NEXT: lw $3, 16($7) +; M32-NEXT: lw $5, 16($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 16($4) +; M32-NEXT: lw $3, 28($7) +; M32-NEXT: lw $5, 28($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 28($4) +; M32-NEXT: lw $3, 24($7) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 24($6) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: sw $1, 24($4) +; +; M64-LABEL: cmovznz4_builtin_ctselect: +; M64: # %bb.0: # %entry +; M64-NEXT: daddiu $2, $zero, -1 +; M64-NEXT: daddiu $1, $zero, -1 +; M64-NEXT: ld $3, 0($5) +; M64-NEXT: movn $2, $zero, $4 +; M64-NEXT: ld $4, 0($6) +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: and $3, $2, $3 +; M64-NEXT: and $4, $1, $4 +; M64-NEXT: or $3, $3, $4 +; M64-NEXT: sd $3, 0($7) +; M64-NEXT: ld $3, 8($6) +; M64-NEXT: ld $4, 8($5) +; M64-NEXT: and $3, $1, $3 +; M64-NEXT: and $4, $2, $4 +; M64-NEXT: or $3, $4, $3 +; M64-NEXT: sd $3, 8($7) +; M64-NEXT: ld $3, 16($6) +; M64-NEXT: ld $4, 16($5) +; M64-NEXT: and $3, $1, $3 +; M64-NEXT: and $4, $2, $4 +; M64-NEXT: or $3, $4, $3 +; M64-NEXT: sd $3, 16($7) +; M64-NEXT: ld $3, 24($6) +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: ld $3, 24($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: sd $1, 24($7) +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, 
align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..8fc1af159ec17 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll @@ -0,0 +1,413 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; M32-LABEL: test_ctselect_smin_zero: +; M32: # %bb.0: +; M32-NEXT: sra $1, $4, 31 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $4 +; +; M64-LABEL: test_ctselect_smin_zero: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sra $2, $1, 31 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $2, $1 + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; M32-LABEL: test_ctselect_smax_zero: +; M32: # %bb.0: +; M32-NEXT: slti $1, $4, 1 +; M32-NEXT: movn $4, $zero, $1 +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_smax_zero: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: slti $1, $2, 1 +; M64-NEXT: jr $ra +; M64-NEXT: movn $2, $zero, $1 + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_smin_generic: +; M32: # %bb.0: +; M32-NEXT: slt $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_smin_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: slt $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $1, $4, $1 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + 
%cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_smax_generic: +; M32: # %bb.0: +; M32-NEXT: slt $1, $5, $4 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_smax_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: slt $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_umin_generic: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_umin_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sltu $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $1, $4, $1 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_umax_generic: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $5, $4 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_umax_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sltu $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; M32-LABEL: test_ctselect_abs: +; M32: # %bb.0: +; M32-NEXT: negu $1, $4 +; M32-NEXT: sra $2, $4, 31 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; M32-NEXT: and $2, $2, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_abs: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: negu $2, $1 +; M64-NEXT: sra $3, $1, 31 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; M32-LABEL: test_ctselect_nabs: +; M32: # %bb.0: +; M32-NEXT: sra $1, $4, 31 +; M32-NEXT: negu $3, $4 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: 
test_ctselect_nabs: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sra $2, $1, 31 +; M64-NEXT: and $3, $2, $1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: not $2, $2 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $3, $1 + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; M32-LABEL: test_ctselect_sign_extend: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: sra $2, $4, 31 +; +; M64-LABEL: test_ctselect_sign_extend: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: jr $ra +; M64-NEXT: sra $2, $1, 31 + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; M32-LABEL: test_ctselect_zero_extend: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: sltu $2, $zero, $4 +; +; M64-LABEL: test_ctselect_zero_extend: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: jr $ra +; M64-NEXT: sltu $2, $zero, $1 + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_constant_folding_true: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_constant_folding_true: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_constant_folding_false: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_constant_folding_false: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; M32-LABEL: test_ctselect_identical_operands: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_identical_operands: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_inverted_condition: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_inverted_condition: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 
%c, i32 %d) { +; M32-LABEL: test_ctselect_chain: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 20($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $6, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_chain: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $8, 0 +; M64-NEXT: sll $4, $10, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $9, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: and $2, $2, $4 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; M32-LABEL: test_ctselect_i64_smin_zero: +; M32: # %bb.0: +; M32-NEXT: sra $1, $5, 31 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: and $3, $1, $5 +; +; M64-LABEL: test_ctselect_i64_smin_zero: +; M64: # %bb.0: +; M64-NEXT: dsra $1, $4, 63 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $1, $4 + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll new file mode 100644 index 0000000000000..1e18a87ea6605 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll @@ -0,0 +1,712 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r6 -mattr=+msa -O3 | FileCheck %s --check-prefix=MIPS64-MSA +; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=+msa -O3 | FileCheck %s --check-prefix=MIPS32-MSA + +; Test 32-bit integer vector (128 bits) +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: 
shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w2[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w2[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w2[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w2[3] + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test 16-bit integer vector (8 x i16 = 128-bit) +define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v8i16: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.h $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.h $w2, $w2, 15 +; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.h $w1, $w1, 27 +; MIPS64-MSA-NEXT: srai.h $w2, $w2, 15 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.h $w0, $w2, 27 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v8i16: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.h $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w1[0], $2 +; MIPS32-MSA-NEXT: insert.w $w0[1], $7 +; MIPS32-MSA-NEXT: lw $2, 32($sp) +; MIPS32-MSA-NEXT: slli.h $w2, $w2, 15 +; MIPS32-MSA-NEXT: srai.h $w2, $w2, 15 +; MIPS32-MSA-NEXT: insert.w $w1[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177 +; MIPS32-MSA-NEXT: shf.h $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS32-MSA-NEXT: shf.h $w0, $w2, 177 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +; Test byte vector (16 x i8 = 128-bit) +define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v16i8: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.b $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; 
MIPS64-MSA-NEXT: slli.b $w2, $w2, 7 +; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27 +; MIPS64-MSA-NEXT: srai.b $w2, $w2, 7 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: bmnz.v $w0, $w1, $w2 +; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v16i8: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.b $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w1[0], $2 +; MIPS32-MSA-NEXT: insert.w $w0[1], $7 +; MIPS32-MSA-NEXT: lw $2, 32($sp) +; MIPS32-MSA-NEXT: slli.b $w2, $w2, 7 +; MIPS32-MSA-NEXT: srai.b $w2, $w2, 7 +; MIPS32-MSA-NEXT: insert.w $w1[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS32-MSA-NEXT: shf.b $w1, $w1, 27 +; MIPS32-MSA-NEXT: bmnz.v $w1, $w0, $w2 +; MIPS32-MSA-NEXT: shf.b $w0, $w1, 27 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test 64-bit integer vector (2 x i64 = 128-bit) +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v2i64: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $5 +; MIPS64-MSA-NEXT: insert.d $w1[0], $7 +; MIPS64-MSA-NEXT: fill.d $w2, $4 +; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: insert.d $w0[1], $6 +; MIPS64-MSA-NEXT: insert.d $w1[1], $8 +; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: copy_s.d $2, $w2[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w2[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v2i64: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: addiu $sp, $sp, -32 +; MIPS32-MSA-NEXT: .cfi_def_cfa_offset 32 +; MIPS32-MSA-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: .cfi_offset 31, -4 +; MIPS32-MSA-NEXT: .cfi_offset 30, -8 +; MIPS32-MSA-NEXT: move $fp, $sp +; MIPS32-MSA-NEXT: .cfi_def_cfa_register 30 +; MIPS32-MSA-NEXT: addiu $1, $zero, -16 +; MIPS32-MSA-NEXT: and $sp, $sp, $1 +; MIPS32-MSA-NEXT: lw $2, 56($fp) +; MIPS32-MSA-NEXT: lw $1, 60($fp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: sw $4, 12($sp) +; MIPS32-MSA-NEXT: sw $4, 4($sp) +; MIPS32-MSA-NEXT: ld.d $w2, 0($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 64($fp) +; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 68($fp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 48($fp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($fp) +; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177 +; 
MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] +; MIPS32-MSA-NEXT: move $sp, $fp +; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: addiu $sp, $sp, 32 + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +; Test single-precision float vector (4 x float = 128-bit) +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4f32: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4f32: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $5 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($4) + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +; Test double-precision float vector (2 x double = 128-bit) +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v2f64: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $5 +; MIPS64-MSA-NEXT: insert.d $w1[0], $7 +; MIPS64-MSA-NEXT: fill.d $w2, $4 +; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: insert.d $w0[1], $6 +; MIPS64-MSA-NEXT: insert.d $w1[1], $8 +; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: copy_s.d $2, $w2[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w2[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v2f64: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: addiu $sp, $sp, -32 +; MIPS32-MSA-NEXT: .cfi_def_cfa_offset 32 +; MIPS32-MSA-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: .cfi_offset 31, -4 +; MIPS32-MSA-NEXT: .cfi_offset 30, -8 +; MIPS32-MSA-NEXT: move $fp, $sp +; MIPS32-MSA-NEXT: .cfi_def_cfa_register 30 +; MIPS32-MSA-NEXT: addiu $1, $zero, -16 +; MIPS32-MSA-NEXT: and $sp, $sp, $1 +; MIPS32-MSA-NEXT: lw $2, 56($fp) +; MIPS32-MSA-NEXT: lw $1, 60($fp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 
+; MIPS32-MSA-NEXT: sw $5, 12($sp) +; MIPS32-MSA-NEXT: sw $5, 4($sp) +; MIPS32-MSA-NEXT: ld.d $w2, 0($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 64($fp) +; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 68($fp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 48($fp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($fp) +; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: st.d $w2, 0($4) +; MIPS32-MSA-NEXT: move $sp, $fp +; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: addiu $sp, $sp, 32 + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; Test with aligned loads (common case) +define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_aligned_load: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w1, 0($5) +; MIPS64-MSA-NEXT: ld.w $w2, 0($6) +; MIPS64-MSA-NEXT: fill.w $w0, $1 +; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_aligned_load: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: fill.w $w0, $4 +; MIPS32-MSA-NEXT: ld.w $w1, 0($5) +; MIPS32-MSA-NEXT: ld.w $w2, 0($6) +; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with unaligned loads (stress test) +define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_unaligned_load: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w1, 0($5) +; MIPS64-MSA-NEXT: ld.w $w2, 0($6) +; MIPS64-MSA-NEXT: fill.w $w0, $1 +; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_unaligned_load: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: fill.w $w0, $4 +; MIPS32-MSA-NEXT: ld.w $w1, 0($5) +; MIPS32-MSA-NEXT: ld.w $w2, 0($6) +; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %a = load <4 x i32>, 
ptr %p1, align 4 + %b = load <4 x i32>, ptr %p2, align 4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with stores to verify result handling +define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_store: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: st.w $w2, 0($9) +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_store: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 40($sp) +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($1) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test chained selects (multiple conditions) +define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_chain: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $8 +; MIPS64-MSA-NEXT: insert.d $w1[0], $6 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: sll $1, $5, 0 +; MIPS64-MSA-NEXT: insert.d $w0[1], $9 +; MIPS64-MSA-NEXT: insert.d $w1[1], $7 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: insert.d $w0[0], $10 +; MIPS64-MSA-NEXT: fill.w $w1, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $11 +; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: bsel.v $w1, $w0, $w2 +; MIPS64-MSA-NEXT: shf.w $w0, $w1, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_chain: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw 
$1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: lw $2, 40($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 44($sp) +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: fill.w $w1, $5 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 48($sp) +; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w1, $w0, $w2 +; MIPS32-MSA-NEXT: copy_s.w $2, $w1[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w1[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w1[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w1[3] + %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) + ret <4 x i32> %result +} + +; Test with arithmetic operations (ensure float vectors work with FP ops) +define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { +; MIPS64-MSA-LABEL: test_ctselect_v4f32_arithmetic: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: fsub.w $w2, $w1, $w0 +; MIPS64-MSA-NEXT: fadd.w $w0, $w1, $w0 +; MIPS64-MSA-NEXT: fill.w $w1, $1 +; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: bsel.v $w1, $w2, $w0 +; MIPS64-MSA-NEXT: shf.w $w0, $w1, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4f32_arithmetic: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: fsub.w $w2, $w1, $w0 +; MIPS32-MSA-NEXT: fadd.w $w0, $w1, $w0 +; MIPS32-MSA-NEXT: fill.w $w1, $5 +; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: bsel.v $w1, $w2, $w0 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w1, 0($4) + %sum = fadd <4 x float> %x, %y + %diff = fsub <4 x float> %x, %y + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %sum, <4 x float> %diff) + ret <4 x float> %result +} + +; Test with mixed operations (load, compute, select, store) +define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_mixed: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w0, 0($5) +; MIPS64-MSA-NEXT: ld.w $w1, 0($6) +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 1 +; MIPS64-MSA-NEXT: addvi.w $w1, $w1, 2 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; 
MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: st.w $w2, 0($7) +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_mixed: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: ld.w $w0, 0($5) +; MIPS32-MSA-NEXT: ld.w $w1, 0($6) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 1 +; MIPS32-MSA-NEXT: addvi.w $w1, $w1, 2 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($7) + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %a_plus_1 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1> + %b_plus_2 = add <4 x i32> %b, <i32 2, i32 2, i32 2, i32 2> + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a_plus_1, <4 x i32> %b_plus_2) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test with function arguments directly (no loads) +define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_args: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_args: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w2[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w2[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w2[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w2[3] + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with multiple uses of result +define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_multi_use: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: addv.w $w0, $w2, $w2 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: 
copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_multi_use: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: addv.w $w0, $w2, $w2 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + %add = add <4 x i32> %sel, %sel ; Use result twice + ret <4 x i32> %add +} + +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll new file mode 100644 index 0000000000000..22b24b33cff3c --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll @@ -0,0 +1,615 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; M32-LABEL: test_ctselect_i8: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i8: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; M32-LABEL: test_ctselect_i16: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i16: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret 
i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_i32: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i32: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; M32-LABEL: test_ctselect_i64: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $2, 16($sp) +; M32-NEXT: addiu $3, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $4, $1, $6 +; M32-NEXT: and $2, $3, $2 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $2, $4, $2 +; M32-NEXT: lw $4, 20($sp) +; M32-NEXT: and $3, $3, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $3, $1, $3 +; +; M64-LABEL: test_ctselect_i64: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_const_true: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_const_true: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_const_false: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_const_false: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_eq: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_eq: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: 
and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_ne: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_ne: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_slt: +; M32: # %bb.0: +; M32-NEXT: slt $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_slt: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: slt $1, $2, $1 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_ult: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_ult: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: sltu $1, $2, $1 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_load: +; M32: # %bb.0: +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_load: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: lw $1, 0($6) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: lw $3, 0($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested 
ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; M32-LABEL: test_ctselect_nested: +; M32: # %bb.0: +; M32-NEXT: andi $1, $5, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_nested: +; M64: # %bb.0: +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sll $1, $7, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; M32-LABEL: test_ctselect_f32: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: mfc1 $1, $f14 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f13 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; M32-LABEL: test_ctselect_f64: +; M32: # %bb.0: +; M32-NEXT: addiu $sp, $sp, -16 +; M32-NEXT: .cfi_def_cfa_offset 16 +; M32-NEXT: mtc1 $6, $f0 +; M32-NEXT: mtc1 $7, $f1 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 36($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: sdc1 $f0, 8($sp) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: lw $4, 12($sp) +; M32-NEXT: and $4, $1, $4 +; M32-NEXT: or $3, $4, $3 +; M32-NEXT: sw $3, 4($sp) +; M32-NEXT: lw $3, 32($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 8($sp) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: sw $1, 0($sp) +; M32-NEXT: ldc1 $f0, 0($sp) +; M32-NEXT: jr $ra +; M32-NEXT: addiu $sp, $sp, 16 +; +; M64-LABEL: test_ctselect_f64: +; M64: # %bb.0: +; M64-NEXT: andi $2, $4, 1 +; M64-NEXT: dmfc1 $1, $f14 +; M64-NEXT: daddiu $3, $2, -1 +; M64-NEXT: dnegu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: dmfc1 $3, $f13 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: dmtc1 $1, $f0 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; M32-LABEL: test_ctselect_f32_chain: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and 
$1, $1, $6 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_chain: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: mfc1 $1, $f15 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f14 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f16 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_f32_load: +; M32: # %bb.0: +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_load: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: lw $1, 0($6) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: lw $3, 0($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_f64_load: +; M32: # %bb.0: +; M32-NEXT: addiu $sp, $sp, -8 +; M32-NEXT: .cfi_def_cfa_offset 8 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 4($6) +; M32-NEXT: lw $4, 4($5) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: and $4, $2, $4 +; M32-NEXT: or $1, $4, $1 +; M32-NEXT: sw $1, 4($sp) +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: sw $1, 0($sp) +; M32-NEXT: ldc1 $f0, 0($sp) +; M32-NEXT: jr $ra +; M32-NEXT: addiu $sp, $sp, 8 +; +; M64-LABEL: test_ctselect_f64_load: +; M64: # %bb.0: +; M64-NEXT: andi $2, $4, 1 +; M64-NEXT: ld $1, 0($6) +; M64-NEXT: daddiu $3, $2, -1 +; M64-NEXT: dnegu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: ld $3, 0($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: dmtc1 $1, $f0 + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; M32-LABEL: test_ctselect_f32_arithmetic: +; M32: # %bb.0: +; M32-NEXT: mtc1 $6, $f0 +; M32-NEXT: mtc1 $5, $f1 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: add.s $f2, $f1, $f0 +; M32-NEXT: sub.s $f0, $f1, $f0 +; M32-NEXT: mfc1 $3, $f2 +; M32-NEXT: and $2, $2, $3 +; 
M32-NEXT: mfc1 $3, $f0 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_arithmetic: +; M64: # %bb.0: +; M64-NEXT: add.s $f0, $f13, $f14 +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: mfc1 $3, $f0 +; M64-NEXT: sub.s $f0, $f13, $f14 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: mfc1 $3, $f0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll new file mode 100644 index 0000000000000..9a0263ad5915c --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; M32-LABEL: test_basic_opts: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_basic_opts: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %a = or i32 %x, 0 + %b = and i32 %a, -1 + %c = xor i32 %b, 0 + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; M32-LABEL: test_constant_fold: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_constant_fold: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_protected_no_branch: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_protected_no_branch: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_explicit_branch: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: beqz $1, $BB3_2 +; M32-NEXT: nop +; M32-NEXT: # %bb.1: # %true +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; M32-NEXT: $BB3_2: # %false +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $6 +; +; M64-LABEL: test_explicit_branch: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; 
M64-NEXT: andi $1, $1, 1 +; M64-NEXT: beqz $1, .LBB3_2 +; M64-NEXT: nop +; M64-NEXT: # %bb.1: # %true +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: .LBB3_2: # %false +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $6, 0 + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_regular_select: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: movn $6, $5, $1 +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $6 +; +; M64-LABEL: test_regular_select: +; M64: # %bb.0: +; M64-NEXT: sll $3, $4, 0 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: jr $ra +; M64-NEXT: movn $2, $1, $3 + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; M32-LABEL: test_xor_all_ones: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_all_ones: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; M32-LABEL: test_xor_same_value: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_same_value: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; M32-LABEL: test_normal_ops: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_normal_ops: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %or1 = or i32 %x, 0 ; Should optimize to %x + %and1 = and i32 %or1, -1 ; Should optimize to %x + %xor1 = xor i32 %and1, 0 ; Should optimize to %x + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; M32-LABEL: test_xor_with_const_operands: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_with_const_operands: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..860f64c3672b0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -0,0 +1,462 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; RV64-LABEL: test_ctselect_i1: +; RV64: # %bb.0: +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i1: +; RV32: # %bb.0: +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + 
ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; RV64-LABEL: test_ctselect_extremal_values: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: lui a1, 524288 +; RV64-NEXT: addi a2, a0, -1 +; RV64-NEXT: negw a0, a0 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: srli a0, a0, 33 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_extremal_values: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: lui a1, 524288 +; RV32-NEXT: addi a2, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; RV64-LABEL: test_ctselect_null_ptr: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_null_ptr: +; RV32: # %bb.0: +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; RV64-LABEL: test_ctselect_function_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_function_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr_cmp: +; RV64: # %bb.0: +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr_cmp: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_struct_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_struct_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; 
Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; RV64-LABEL: test_ctselect_deeply_nested: +; RV64: # %bb.0: +; RV64-NEXT: lw t0, 0(sp) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a2, a2, 1 +; RV64-NEXT: andi a3, a3, 1 +; RV64-NEXT: addi t1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a5, t1, a5 +; RV64-NEXT: neg t1, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: neg a4, a2 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a1, a6 +; RV64-NEXT: neg a6, a3 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: or a0, a0, a5 +; RV64-NEXT: and a0, t1, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: and a0, a4, a0 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: and a0, a6, a0 +; RV64-NEXT: and a1, a3, t0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_deeply_nested: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: andi a3, a3, 1 +; RV32-NEXT: addi t1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a5, t1, a5 +; RV32-NEXT: neg t1, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a6 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: and a0, t1, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: and a0, a6, a0 +; RV32-NEXT: and a1, a3, t0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; RV64-LABEL: cmovznz4_fstar_original: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: beqz a0, .LBB7_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a4, a2 +; RV64-NEXT: .LBB7_2: # %entry +; RV64-NEXT: beqz a0, .LBB7_6 +; RV64-NEXT: # %bb.3: # %entry +; RV64-NEXT: addi a5, a2, 8 +; RV64-NEXT: bnez a0, .LBB7_7 +; RV64-NEXT: .LBB7_4: +; RV64-NEXT: addi a6, a1, 16 +; RV64-NEXT: ld a4, 0(a4) +; RV64-NEXT: ld a5, 0(a5) +; RV64-NEXT: ld a6, 0(a6) +; RV64-NEXT: bnez a0, .LBB7_8 +; RV64-NEXT: .LBB7_5: +; RV64-NEXT: addi a1, a1, 24 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: sd a5, 8(a3) +; RV64-NEXT: sd a6, 16(a3) +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; RV64-NEXT: .LBB7_6: +; RV64-NEXT: addi a5, a1, 8 +; RV64-NEXT: beqz a0, .LBB7_4 +; RV64-NEXT: .LBB7_7: # %entry +; RV64-NEXT: addi a6, a2, 16 +; RV64-NEXT: ld a4, 0(a4) +; RV64-NEXT: ld a5, 0(a5) +; RV64-NEXT: ld a6, 0(a6) +; RV64-NEXT: beqz a0, .LBB7_5 +; RV64-NEXT: .LBB7_8: # %entry +; RV64-NEXT: addi a1, a2, 24 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: sd a5, 8(a3) +; RV64-NEXT: sd a6, 16(a3) +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; +; RV32-LABEL: cmovznz4_fstar_original: +; RV32: # %bb.0: # %entry +; RV32-NEXT: or a0, a0, a1 
+; RV32-NEXT: mv a1, a2 +; RV32-NEXT: beqz a0, .LBB7_2 +; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: mv a1, a3 +; RV32-NEXT: .LBB7_2: # %entry +; RV32-NEXT: beqz a0, .LBB7_5 +; RV32-NEXT: # %bb.3: # %entry +; RV32-NEXT: addi a5, a3, 8 +; RV32-NEXT: bnez a0, .LBB7_6 +; RV32-NEXT: .LBB7_4: +; RV32-NEXT: addi t0, a2, 16 +; RV32-NEXT: j .LBB7_7 +; RV32-NEXT: .LBB7_5: +; RV32-NEXT: addi a5, a2, 8 +; RV32-NEXT: beqz a0, .LBB7_4 +; RV32-NEXT: .LBB7_6: # %entry +; RV32-NEXT: addi t0, a3, 16 +; RV32-NEXT: .LBB7_7: # %entry +; RV32-NEXT: lw a6, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: lw a7, 0(a5) +; RV32-NEXT: lw a5, 4(a5) +; RV32-NEXT: lw t1, 0(t0) +; RV32-NEXT: lw t0, 4(t0) +; RV32-NEXT: beqz a0, .LBB7_9 +; RV32-NEXT: # %bb.8: # %entry +; RV32-NEXT: addi a2, a3, 24 +; RV32-NEXT: j .LBB7_10 +; RV32-NEXT: .LBB7_9: +; RV32-NEXT: addi a2, a2, 24 +; RV32-NEXT: .LBB7_10: # %entry +; RV32-NEXT: lw a0, 0(a2) +; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: sw a6, 0(a4) +; RV32-NEXT: sw a1, 4(a4) +; RV32-NEXT: sw a7, 8(a4) +; RV32-NEXT: sw a5, 12(a4) +; RV32-NEXT: sw t1, 16(a4) +; RV32-NEXT: sw t0, 20(a4) +; RV32-NEXT: sw a0, 24(a4) +; RV32-NEXT: sw a2, 28(a4) +; RV32-NEXT: ret +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; RV64-LABEL: cmovznz4_builtin_ctselect: +; RV64: # %bb.0: # %entry +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ld a4, 0(a2) +; RV64-NEXT: ld a5, 0(a1) +; RV64-NEXT: neg a6, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: ld a4, 8(a2) +; RV64-NEXT: ld a5, 8(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 8(a3) +; RV64-NEXT: ld a4, 16(a2) +; RV64-NEXT: ld a5, 16(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 16(a3) +; RV64-NEXT: ld a2, 24(a2) +; RV64-NEXT: ld a1, 24(a1) +; RV64-NEXT: and a2, a6, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; +; RV32-LABEL: cmovznz4_builtin_ctselect: +; RV32: # %bb.0: # %entry +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: lw a1, 0(a2) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: lw a6, 0(a3) +; RV32-NEXT: 
lw a7, 4(a3) +; RV32-NEXT: snez t0, a0 +; RV32-NEXT: neg a0, t0 +; RV32-NEXT: addi t0, t0, -1 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a1, t0, a1 +; RV32-NEXT: and a7, a0, a7 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: or a1, a1, a6 +; RV32-NEXT: or a5, a5, a7 +; RV32-NEXT: sw a1, 0(a4) +; RV32-NEXT: sw a5, 4(a4) +; RV32-NEXT: lw a1, 8(a3) +; RV32-NEXT: lw a5, 8(a2) +; RV32-NEXT: lw a6, 12(a3) +; RV32-NEXT: lw a7, 12(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a5, a7, a6 +; RV32-NEXT: sw a1, 8(a4) +; RV32-NEXT: sw a5, 12(a4) +; RV32-NEXT: lw a1, 16(a3) +; RV32-NEXT: lw a5, 16(a2) +; RV32-NEXT: lw a6, 20(a3) +; RV32-NEXT: lw a7, 20(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a5, a7, a6 +; RV32-NEXT: sw a1, 16(a4) +; RV32-NEXT: sw a5, 20(a4) +; RV32-NEXT: lw a1, 24(a3) +; RV32-NEXT: lw a5, 24(a2) +; RV32-NEXT: lw a3, 28(a3) +; RV32-NEXT: lw a2, 28(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sw a1, 24(a4) +; RV32-NEXT: sw a0, 28(a4) +; RV32-NEXT: ret +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..27c0d521bb631 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -0,0 +1,388 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a1, a0, 31 +; RV64-NEXT: 
and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a1, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smax_zero: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: sgtz a1, a1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_zero: +; RV32: # %bb.0: +; RV32-NEXT: sgtz a1, a0 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a1, a0 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umin_generic: +; RV32: # %bb.0: +; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umax_generic: +; RV32: # %bb.0: +; 
RV32-NEXT: sltu a2, a1, a0 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; RV64-LABEL: test_ctselect_abs: +; RV64: # %bb.0: +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_abs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; RV64-LABEL: test_ctselect_nabs: +; RV64: # %bb.0: +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nabs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; RV64-LABEL: test_ctselect_sign_extend: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a0, a0, 31 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_sign_extend: +; RV32: # %bb.0: +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; RV64-LABEL: test_ctselect_zero_extend: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_zero_extend: +; RV32: # %bb.0: +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: ret + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; RV64-LABEL: test_ctselect_identical_operands: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: 
test_ctselect_identical_operands: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_inverted_condition: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_inverted_condition: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; RV64-LABEL: test_ctselect_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a2, a2, 1 +; RV64-NEXT: addi a7, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a4, a7, a4 +; RV64-NEXT: neg a7, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: neg a3, a2 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a1, a5 +; RV64-NEXT: or a0, a0, a4 +; RV64-NEXT: and a0, a7, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: and a1, a2, a6 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: addi a7, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a5 +; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a6 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; RV64-LABEL: test_ctselect_i64_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: srai a1, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a2, a1, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: ret + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll new file mode 100644 index 0000000000000..014d95c3883b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll @@ -0,0 +1,804 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -O3 | FileCheck %s --check-prefix=RV32 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvl128b -O3 | FileCheck %s --check-prefix=RV32-V128 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvl256b -O3 | FileCheck %s --check-prefix=RV64-V256 + + +; Basic pass-through select on nxv4i32 +define @ctsel_nxv4i32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a, %b) + ret %r +} + +; Select with loads (aligned) +define @ctsel_nxv4i32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: ctsel_nxv4i32_load: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_load: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; 
RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_load: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_load: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a = load , ptr %p1, align 16 + %b = load , ptr %p2, align 16 + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a, %b) + ret %r +} + +; Mixed: do arithmetic first, then select, then store +define void @ctsel_nxv4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; RV64-LABEL: ctsel_nxv4i32_mixed: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vadd.vv v10, v10, v10 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vs2r.v v8, (a3) +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_mixed: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vs2r.v v8, (a3) +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_mixed: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vadd.vv v8, v8, v8 +; RV32-V128-NEXT: vadd.vv v10, v10, v10 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; 
RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vs2r.v v8, (a3) +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_mixed: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vadd.vv v8, v8, v8 +; RV64-V256-NEXT: vadd.vv v10, v10, v10 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vs2r.v v8, (a3) +; RV64-V256-NEXT: ret + %a = load , ptr %p1, align 16 + %b = load , ptr %p2, align 16 + ; avoid scalable vector constants: use %a+%a and %b+%b + %a2 = add %a, %a + %b2 = add %b, %b + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a2, %b2) + store %r, ptr %out, align 16 + ret void +} + +; Const-true/false fold smoke tests +define @ctsel_nxv4i32_true( %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_true: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_true: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_true: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 true, %a, %b) + ret %r +} + +define @ctsel_nxv4i32_false( %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_false: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_false: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_false: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-V128-NEXT: vmv2r.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_false: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-V256-NEXT: vmv2r.v v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 false, %a, %b) + ret %r +} + +; Chain two selects to ensure masks don’t get merged away +define @ctsel_nxv4i32_chain(i1 %c1, i1 %c2, +; RV64-LABEL: ctsel_nxv4i32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v14, 0 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v16, a0 +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v18, v14, -1, v0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-NEXT: vand.vv v8, v18, v8 +; RV64-NEXT: vnot.v v16, v18 +; RV64-NEXT: vand.vv v10, v16, v10 +; RV64-NEXT: vnot.v v16, v14 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vv v8, v14, v8 +; RV64-NEXT: vand.vv v10, v16, v12 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v14, 0 +; 
RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v18, v14, -1, v0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-NEXT: vand.vv v8, v18, v8 +; RV32-NEXT: vnot.v v16, v18 +; RV32-NEXT: vand.vv v10, v16, v10 +; RV32-NEXT: vnot.v v16, v14 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v14, v8 +; RV32-NEXT: vand.vv v10, v16, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_chain: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v14, 0 +; RV32-V128-NEXT: andi a1, a1, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v16, a0 +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vmv.v.x v16, a1 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v18, v14, -1, v0 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v18, v8 +; RV32-V128-NEXT: vnot.v v16, v18 +; RV32-V128-NEXT: vand.vv v10, v16, v10 +; RV32-V128-NEXT: vnot.v v16, v14 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vand.vv v8, v14, v8 +; RV32-V128-NEXT: vand.vv v10, v16, v12 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_chain: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v14, 0 +; RV64-V256-NEXT: andi a1, a1, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v16, a0 +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vmv.v.x v16, a1 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v18, v14, -1, v0 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v18, v8 +; RV64-V256-NEXT: vnot.v v16, v18 +; RV64-V256-NEXT: vand.vv v10, v16, v10 +; RV64-V256-NEXT: vnot.v v16, v14 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vand.vv v8, v14, v8 +; RV64-V256-NEXT: vand.vv v10, v16, v12 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a, + %b, + %c) { + %t = call @llvm.ct.select.nxv4i32(i1 %c1, %a, %b) + %r = call @llvm.ct.select.nxv4i32(i1 %c2, %t, %c) + ret %r +} + +; A different element width +define @ctsel_nxv8i16_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv8i16_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv8i16_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, 
m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv8i16_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv8i16_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv8i16(i1 %cond, %a, %b) + ret %r +} + +define @ctsel_nxv16i8_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv16i8_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv16i8_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv16i8_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv16i8_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv16i8(i1 %cond, %a, %b) + ret %r +} + +; 64-bit elements (useful on RV64) +define @ctsel_nxv2i64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2i64_basic: +; RV64: # %bb.0: +; 
RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2i64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv2i64_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv2i64_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv2i64(i1 %cond, %a, %b) + ret %r +} + +; Floating-point scalable vectors (bitcasted in your fallback) +define @ctsel_nxv4f32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4f32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, 
v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %a, %b) + ret %r +} + +; FP arithmetic around select +define @ctsel_nxv4f32_arith(i1 %cond, %x, %y) { +; RV64-LABEL: ctsel_nxv4f32_arith: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vfadd.vv v12, v8, v10 +; RV64-NEXT: vfsub.vv v8, v8, v10 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-NEXT: vand.vv v12, v10, v12 +; RV64-NEXT: vnot.v v10, v10 +; RV64-NEXT: vand.vv v8, v10, v8 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_arith: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vfadd.vv v12, v8, v10 +; RV32-NEXT: vfsub.vv v8, v8, v10 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-NEXT: vand.vv v12, v10, v12 +; RV32-NEXT: vnot.v v10, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_arith: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vfadd.vv v12, v8, v10 +; RV32-V128-NEXT: vfsub.vv v8, v8, v10 +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v10, a0 +; RV32-V128-NEXT: vmsne.vi v0, v10, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v10, 0 +; RV32-V128-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-V128-NEXT: vand.vv v12, v10, v12 +; RV32-V128-NEXT: vnot.v v10, v10 +; RV32-V128-NEXT: vand.vv v8, v10, v8 +; RV32-V128-NEXT: vor.vv v8, v12, v8 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_arith: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vfadd.vv v12, v8, v10 +; RV64-V256-NEXT: vfsub.vv v8, v8, v10 +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v10, a0 +; RV64-V256-NEXT: vmsne.vi v0, v10, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v10, 0 +; RV64-V256-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-V256-NEXT: vand.vv v12, v10, v12 +; RV64-V256-NEXT: vnot.v v10, v10 +; RV64-V256-NEXT: vand.vv v8, v10, v8 +; RV64-V256-NEXT: vor.vv v8, v12, v8 +; RV64-V256-NEXT: ret + %sum = fadd %x, %y + %diff = fsub %x, %y + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %sum, %diff) + ret %r +} + +define @ctsel_nxv2f64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2f64_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli 
a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2f64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv2f64_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv2f64_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1 %cond, <vscale x 2 x double> %a, <vscale x 2 x double> %b) + ret <vscale x 2 x double> %r +} + +declare <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.ct.select.nxv8i16(i1, <vscale x 8 x i16>, <vscale x 8 x i16>) +declare <vscale x 16 x i8> @llvm.ct.select.nxv16i8(i1, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare <vscale x 2 x i64> @llvm.ct.select.nxv2i64(i1, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare <vscale x 4 x float> @llvm.ct.select.nxv4f32(i1, <vscale x 4 x float>, <vscale x 4 x float>) +declare <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1, <vscale x 2 x double>, <vscale x 2 x double>) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll new file mode 100644 index 0000000000000..1625c8db2d85c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -0,0 +1,600 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; RV64-LABEL: test_ctselect_i8: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i8: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; RV64-LABEL: test_ctselect_i16: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi
a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i16: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_i32: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i32: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; RV64-LABEL: test_ctselect_i64: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: or a0, a1, a3 +; RV32-NEXT: or a1, a2, a4 +; RV32-NEXT: ret + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_const_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_const_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_const_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_const_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_eq: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 
+; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_eq: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ne: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_ne: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_slt: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: slt a0, a0, a1 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_slt: +; RV32: # %bb.0: +; RV32-NEXT: slt a0, a0, a1 +; RV32-NEXT: addi a1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ult: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_ult: +; RV32: # %bb.0: +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_load: +; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 
@test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; RV64-LABEL: test_ctselect_nested: +; RV64: # %bb.0: +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a5, a1, -1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a3, a5, a3 +; RV64-NEXT: neg a5, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: or a1, a1, a3 +; RV64-NEXT: and a1, a5, a1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nested: +; RV32: # %bb.0: +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a1, -1 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: neg a5, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; RV64-LABEL: test_ctselect_f32: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; RV64-LABEL: test_ctselect_f64: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: or a0, a1, a3 +; RV32-NEXT: or a1, a2, a4 +; RV32-NEXT: ret + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; RV64-LABEL: test_ctselect_f32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: addi a5, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a3, a5, a3 +; RV64-NEXT: neg a5, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a3 +; RV64-NEXT: and a0, a5, a0 +; RV64-NEXT: and a1, a1, a4 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: neg a5, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: and a0, a5, a0 +; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float 
@llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_f32_load: +; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_f64_load: +; RV64: # %bb.0: +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: ld a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: lw a4, 0(a2) +; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a3, a0, a3 +; RV32-NEXT: and a2, a5, a2 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: or a0, a3, a4 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: ret + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; RV64-LABEL: test_ctselect_f32_arithmetic: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -48 +; RV64-NEXT: .cfi_def_cfa_offset 48 +; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: mv s0, a2 +; RV64-NEXT: mv s1, a1 +; RV64-NEXT: mv s2, a0 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: call __addsf3 +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: mv a0, s1 +; RV64-NEXT: mv a1, s0 +; RV64-NEXT: call __subsf3 +; RV64-NEXT: andi a1, s2, 1 +; RV64-NEXT: neg a2, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a2, a2, s3 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: .cfi_restore s3 +; RV64-NEXT: addi sp, sp, 48 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_arithmetic: +; RV32: # %bb.0: +; 
RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: call __addsf3 +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: call __subsf3 +; RV32-NEXT: andi a1, s2, 1 +; RV32-NEXT: neg a2, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a2, a2, s3 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll new file mode 100644 index 0000000000000..60f6350d6508d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 -filetype=asm | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 -filetype=asm | FileCheck %s --check-prefix=RV32 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; RV64-LABEL: test_basic_opts: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_basic_opts: +; RV32: # %bb.0: +; RV32-NEXT: ret + %a = or i32 %x, 0 ; Should eliminate + %b = and i32 %a, -1 ; Should eliminate + %c = xor i32 %b, 0 ; Should eliminate + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; RV64-LABEL: test_constant_fold: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_constant_fold: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_protected_no_branch: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: 
test_protected_no_branch: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_explicit_branch: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: beqz a0, .LBB3_2 +; RV64-NEXT: # %bb.1: # %true +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; RV64-NEXT: .LBB3_2: # %false +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_explicit_branch: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB3_2 +; RV32-NEXT: # %bb.1: # %true +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret +; RV32-NEXT: .LBB3_2: # %false +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_regular_select: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret +; +; RV32-LABEL: test_regular_select: +; RV32: # %bb.0: +; RV32-NEXT: andi a3, a0, 1 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: bnez a3, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; RV64-LABEL: test_xor_all_ones: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_all_ones: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; RV64-LABEL: test_xor_same_value: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_same_value: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; RV64-LABEL: test_normal_ops: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_normal_ops: +; RV32: # %bb.0: +; RV32-NEXT: ret + %or1 = or i32 %x, 0 ; Should optimize to %x + %and1 = and i32 %or1, -1 ; Should optimize to %x + %xor1 = xor i32 %and1, 0 ; Should optimize to %x + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; RV64-LABEL: test_xor_with_const_operands: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_with_const_operands: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %a = xor i32 -1, -1 ; -1 ^ -1 should become 0 + %b = xor i32 0, 0 ; 0 ^ 0 should become 0 + %c = xor i32 42, 42 ; 42 ^ 42 should become 0 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..19f01b37ba8cb --- /dev/null +++ 
b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -0,0 +1,663 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; W32-LABEL: test_ctselect_i1: +; W32: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i1: +; W64: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; W32-LABEL: test_ctselect_extremal_values: +; W32: .functype test_ctselect_extremal_values (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 2147483647 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: i32.const -2147483648 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_extremal_values: +; W64: .functype test_ctselect_extremal_values (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 2147483647 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: i32.const -2147483648 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; W32-LABEL: test_ctselect_null_ptr: +; W32: .functype test_ctselect_null_ptr (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_null_ptr: +; W64: .functype test_ctselect_null_ptr (i32, i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; W32-LABEL: test_ctselect_function_ptr: +; W32: .functype test_ctselect_function_ptr (i32, i32, i32) -> (i32) 
+; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_function_ptr: +; W64: .functype test_ctselect_function_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_ptr_cmp: +; W32: .functype test_ctselect_ptr_cmp (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.eq +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_ptr_cmp: +; W64: .functype test_ctselect_ptr_cmp (i64, i64, i64, i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.eq +; W64-NEXT: i64.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_struct_ptr: +; W32: .functype test_ctselect_struct_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_struct_ptr: +; W64: .functype test_ctselect_struct_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr 
%result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; W32-LABEL: test_ctselect_deeply_nested: +; W32: .functype test_ctselect_deeply_nested (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 3 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 2 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 5 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 6 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 7 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 3 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 8 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_deeply_nested: +; W64: .functype test_ctselect_deeply_nested (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 3 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 2 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 5 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 6 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 7 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 3 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 8 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr 
%x, ptr %y, ptr %r) { +; W32-LABEL: cmovznz4_fstar_original: +; W32: .functype cmovznz4_fstar_original (i64, i32, i32, i32) -> () +; W32-NEXT: .local i32, i64, i64 +; W32-NEXT: # %bb.0: # %entry +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.eqz +; W32-NEXT: local.tee 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 8 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 8 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 5 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 16 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 16 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 6 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 24 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 24 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.store 24 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 6 +; W32-NEXT: i64.store 16 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 5 +; W32-NEXT: i64.store 8 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.store 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: cmovznz4_fstar_original: +; W64: .functype cmovznz4_fstar_original (i64, i64, i64, i64) -> () +; W64-NEXT: .local i32, i64, i64 +; W64-NEXT: # %bb.0: # %entry +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.eqz +; W64-NEXT: local.tee 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 8 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 8 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 5 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 16 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 16 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 6 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 24 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 24 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.store 24 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 6 +; W64-NEXT: i64.store 16 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 5 +; W64-NEXT: i64.store 8 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.store 0 +; W64-NEXT: # fallthrough-return +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = 
getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; W32-LABEL: cmovznz4_builtin_ctselect: +; W32: .functype cmovznz4_builtin_ctselect (i64, i32, i32, i32) -> () +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: # %entry +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.eqz +; W32-NEXT: i64.select +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: local.get 0 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.xor +; W32-NEXT: local.tee 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 0 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 8 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 8 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 8 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 16 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 16 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 16 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 24 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 24 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 24 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: cmovznz4_builtin_ctselect: +; W64: .functype cmovznz4_builtin_ctselect (i64, i64, i64, i64) -> () +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: # %entry +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.eqz +; W64-NEXT: i64.select +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: local.get 0 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.tee 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 0 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 8 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 8 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 8 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 16 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 16 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 16 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 24 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 24 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 24 +; W64-NEXT: # fallthrough-return +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, 
align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..5c8d66249a95a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll @@ -0,0 +1,611 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; W32-LABEL: test_ctselect_smin_zero: +; W32: .functype test_ctselect_smin_zero (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smin_zero: +; W64: .functype test_ctselect_smin_zero (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; W32-LABEL: test_ctselect_smax_zero: +; W32: .functype test_ctselect_smax_zero (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.gt_s +; W32-NEXT: i32.select +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smax_zero: +; W64: .functype test_ctselect_smax_zero (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.gt_s +; W64-NEXT: i32.select +; W64-NEXT: # fallthrough-return + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern 
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_smin_generic: +; W32: .functype test_ctselect_smin_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smin_generic: +; W64: .functype test_ctselect_smin_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_smax_generic: +; W32: .functype test_ctselect_smax_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.gt_s +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smax_generic: +; W64: .functype test_ctselect_smax_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.gt_s +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_umin_generic: +; W32: .functype test_ctselect_umin_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_u +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_umin_generic: +; W64: .functype test_ctselect_umin_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_u +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and 
+; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_umax_generic: +; W32: .functype test_ctselect_umax_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.gt_u +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_umax_generic: +; W64: .functype test_ctselect_umax_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.gt_u +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; W32-LABEL: test_ctselect_abs: +; W32: .functype test_ctselect_abs (i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_abs: +; W64: .functype test_ctselect_abs (i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; W32-LABEL: test_ctselect_nabs: +; W32: .functype test_ctselect_nabs (i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_nabs: +; W64: .functype test_ctselect_nabs (i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; 
W64-NEXT: local.tee 1 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; W32-LABEL: test_ctselect_sign_extend: +; W32: .functype test_ctselect_sign_extend (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_sign_extend: +; W64: .functype test_ctselect_sign_extend (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; W32-LABEL: test_ctselect_zero_extend: +; W32: .functype test_ctselect_zero_extend (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.ne +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_zero_extend: +; W64: .functype test_ctselect_zero_extend (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.ne +; W64-NEXT: # fallthrough-return + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_constant_folding_true: +; W32: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_constant_folding_true: +; W64: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_constant_folding_false: +; W32: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_constant_folding_false: +; W64: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; W32-LABEL: test_ctselect_identical_operands: +; W32: .functype test_ctselect_identical_operands (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_identical_operands: +; W64: .functype test_ctselect_identical_operands (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition 
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_inverted_condition: +; W32: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.ne +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_inverted_condition: +; W64: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.ne +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; W32-LABEL: test_ctselect_chain: +; W32: .functype test_ctselect_chain (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 2 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 5 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 6 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_chain: +; W64: .functype test_ctselect_chain (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 2 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 5 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; 
W64-NEXT: local.get 6 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; W32-LABEL: test_ctselect_i64_smin_zero: +; W32: .functype test_ctselect_i64_smin_zero (i64) -> (i64) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i64.const 63 +; W32-NEXT: i64.shr_s +; W32-NEXT: local.get 0 +; W32-NEXT: i64.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i64_smin_zero: +; W64: .functype test_ctselect_i64_smin_zero (i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i64.const 63 +; W64-NEXT: i64.shr_s +; W64-NEXT: local.get 0 +; W64-NEXT: i64.and +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll new file mode 100644 index 0000000000000..daa7370fb481a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll @@ -0,0 +1,566 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -mattr=+simd128 | FileCheck %s --check-prefix=WASM32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -mattr=+simd128 | FileCheck %s --check-prefix=WASM64 + +; Test 32-bit integer vector (4 x i32 = 128-bit) +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; WASM32-LABEL: test_ctselect_v4i32: +; WASM32: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32: +; WASM64: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test 16-bit integer vector (8 x i16 = 128-bit) +define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; WASM32-LABEL: test_ctselect_v8i16: +; WASM32: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i16x8.splat +; WASM32-NEXT: i32.const 15 +; WASM32-NEXT: i16x8.shl +; WASM32-NEXT: i32.const 15 +; WASM32-NEXT: i16x8.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v8i16: +; WASM64: .functype 
test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i16x8.splat +; WASM64-NEXT: i32.const 15 +; WASM64-NEXT: i16x8.shl +; WASM64-NEXT: i32.const 15 +; WASM64-NEXT: i16x8.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +; Test byte vector (16 x i8 = 128-bit) +define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; WASM32-LABEL: test_ctselect_v16i8: +; WASM32: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v16i8: +; WASM64: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test 64-bit integer vector (2 x i64 = 128-bit) +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; WASM32-LABEL: test_ctselect_v2i64: +; WASM32: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shl +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v2i64: +; WASM64: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shl +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +; Test single-precision float vector (4 x float = 128-bit) +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; WASM32-LABEL: test_ctselect_v4f32: +; WASM32: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4f32: +; WASM64: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: 
i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +; Test double-precision float vector (2 x double = 128-bit) +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; WASM32-LABEL: test_ctselect_v2f64: +; WASM32: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shl +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v2f64: +; WASM64: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shl +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; Test with aligned loads (common case) +define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { +; WASM32-LABEL: test_ctselect_v4i32_aligned_load: +; WASM32: .functype test_ctselect_v4i32_aligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_aligned_load: +; WASM64: .functype test_ctselect_v4i32_aligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with unaligned loads (stress test) +define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; WASM32-LABEL: test_ctselect_v4i32_unaligned_load: +; WASM32: .functype test_ctselect_v4i32_unaligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_unaligned_load: +; WASM64: .functype test_ctselect_v4i32_unaligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: 
v128.load 0:p2align=2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %a = load <4 x i32>, ptr %p1, align 4 + %b = load <4 x i32>, ptr %p2, align 4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with stores to verify result handling +define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { +; WASM32-LABEL: test_ctselect_v4i32_store: +; WASM32: .functype test_ctselect_v4i32_store (i32, v128, v128, i32) -> () +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: v128.store 0 +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_store: +; WASM64: .functype test_ctselect_v4i32_store (i32, v128, v128, i64) -> () +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: v128.store 0 +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test chained selects (multiple conditions) +define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; WASM32-LABEL: test_ctselect_v4i32_chain: +; WASM32: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.get 4 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_chain: +; WASM64: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.get 4 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) + ret <4 x i32> %result +} + +; Test with arithmetic operations (ensure 
float vectors work with FP ops) +define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { +; WASM32-LABEL: test_ctselect_v4f32_arithmetic: +; WASM32: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.add +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.sub +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4f32_arithmetic: +; WASM64: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.add +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.sub +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %sum = fadd <4 x float> %x, %y + %diff = fsub <4 x float> %x, %y + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %sum, <4 x float> %diff) + ret <4 x float> %result +} + +; Test with zero vectors +define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { +; WASM32-LABEL: test_ctselect_v4i32_zeros: +; WASM32: .functype test_ctselect_v4i32_zeros (i32, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_zeros: +; WASM64: .functype test_ctselect_v4i32_zeros (i32, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, + <4 x i32> %a, + <4 x i32> zeroinitializer) + ret <4 x i32> %result +} + +; Test with function arguments directly (no loads) +define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { +; WASM32-LABEL: test_ctselect_v4i32_args: +; WASM32: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_args: +; WASM64: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x 
i32> %b) + ret <4 x i32> %result +} + +; Test with multiple uses of result +define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; WASM32-LABEL: test_ctselect_v4i32_multi_use: +; WASM32: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 2 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: i32x4.add +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_multi_use: +; WASM64: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: i32x4.add +; WASM64-NEXT: # fallthrough-return + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + %add = add <4 x i32> %sel, %sel ; Use result twice + ret <4 x i32> %add +} + +; Test byte vector with operations +define <16 x i8> @test_ctselect_v16i8_ops(i1 %cond, <16 x i8> %x, <16 x i8> %y) { +; WASM32-LABEL: test_ctselect_v16i8_ops: +; WASM32: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.xor +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v16i8_ops: +; WASM64: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.xor +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %xor = xor <16 x i8> %x, %y + %and = and <16 x i8> %x, %y + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %xor, <16 x i8> %and) + ret <16 x i8> %result +} + +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll new file mode 100644 index 0000000000000..4e356f8562b39 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -0,0 +1,909 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s 
-mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; W32-LABEL: test_ctselect_i8: +; W32: .functype test_ctselect_i8 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i8: +; W64: .functype test_ctselect_i8 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; W32-LABEL: test_ctselect_i16: +; W32: .functype test_ctselect_i16 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i16: +; W64: .functype test_ctselect_i16 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_i32: +; W32: .functype test_ctselect_i32 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i32: +; W64: .functype test_ctselect_i32 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 
@test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; W32-LABEL: test_ctselect_i64: +; W32: .functype test_ctselect_i64 (i32, i64, i64) -> (i64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.add +; W32-NEXT: local.get 2 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i64: +; W64: .functype test_ctselect_i64 (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_ptr: +; W32: .functype test_ctselect_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_ptr: +; W64: .functype test_ctselect_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_const_true: +; W32: .functype test_ctselect_const_true (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_const_true: +; W64: .functype test_ctselect_const_true (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_const_false: +; W32: .functype test_ctselect_const_false (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_const_false: +; W64: .functype test_ctselect_const_false (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: 
test_ctselect_icmp_eq: +; W32: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.eq +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_eq: +; W64: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.eq +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_ne: +; W32: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.ne +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_ne: +; W64: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.ne +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_slt: +; W32: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_slt: +; W64: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp slt i32 %x, %y + %result = call i32 
@llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_ult: +; W32: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_u +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_ult: +; W64: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_u +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_load: +; W32: .functype test_ctselect_load (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_load: +; W64: .functype test_ctselect_load (i32, i64, i64) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; W32-LABEL: test_ctselect_nested: +; W32: .functype test_ctselect_nested (i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_nested: +; W64: .functype 
test_ctselect_nested (i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; W32-LABEL: test_ctselect_f32: +; W32: .functype test_ctselect_f32 (i32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32: +; W64: .functype test_ctselect_f32 (i32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; W32-LABEL: test_ctselect_f64: +; W32: .functype test_ctselect_f64 (i32, f64, f64) -> (f64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i64.reinterpret_f64 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.add +; W32-NEXT: local.get 2 +; W32-NEXT: i64.reinterpret_f64 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: f64.reinterpret_i64 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f64: +; W64: .functype test_ctselect_f64 (i32, f64, f64) -> (f64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.reinterpret_f64 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.reinterpret_f64 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: 
f64.reinterpret_i64 +; W64-NEXT: # fallthrough-return + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; W32-LABEL: test_ctselect_f32_chain: +; W32: .functype test_ctselect_f32_chain (i32, i32, f32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 2 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 3 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_chain: +; W64: .functype test_ctselect_f32_chain (i32, i32, f32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 2 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 3 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 4 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_f32_load: +; W32: .functype test_ctselect_f32_load (i32, i32, i32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_load: +; W64: .functype test_ctselect_f32_load (i32, i64, i64) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %a = 
load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_f64_load: +; W32: .functype test_ctselect_f64_load (i32, i32, i32) -> (f64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.add +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: f64.reinterpret_i64 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f64_load: +; W64: .functype test_ctselect_f64_load (i32, i64, i64) -> (f64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: f64.reinterpret_i64 +; W64-NEXT: # fallthrough-return + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; W32-LABEL: test_ctselect_f32_arithmetic: +; W32: .functype test_ctselect_f32_arithmetic (i32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: f32.add +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: f32.sub +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_arithmetic: +; W64: .functype test_ctselect_f32_arithmetic (i32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: f32.add +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: f32.sub +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare ptr 
@llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll new file mode 100644 index 0000000000000..5b20e892c64d2 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll @@ -0,0 +1,226 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; W32-LABEL: test_basic_opts: +; W32: .functype test_basic_opts (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_basic_opts: +; W64: .functype test_basic_opts (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %a = or i32 %x, 0 ; Should eliminate + %b = and i32 %a, -1 ; Should eliminate + %c = xor i32 %b, 0 ; Should eliminate + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; W32-LABEL: test_constant_fold: +; W32: .functype test_constant_fold () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_constant_fold: +; W64: .functype test_constant_fold () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_protected_no_branch: +; W32: .functype test_protected_no_branch (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_protected_no_branch: +; W64: .functype test_protected_no_branch (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_explicit_branch: +; W32: .functype test_explicit_branch (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: block +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.eqz +; W32-NEXT: br_if 0 # 0: down to label0 +; W32-NEXT: # %bb.1: # %true +; W32-NEXT: local.get 1 +; W32-NEXT: return +; W32-NEXT: .LBB3_2: # %false +; W32-NEXT: end_block # label0: +; W32-NEXT: local.get 2 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: 
test_explicit_branch: +; W64: .functype test_explicit_branch (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: block +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.eqz +; W64-NEXT: br_if 0 # 0: down to label0 +; W64-NEXT: # %bb.1: # %true +; W64-NEXT: local.get 1 +; W64-NEXT: return +; W64-NEXT: .LBB3_2: # %false +; W64-NEXT: end_block # label0: +; W64-NEXT: local.get 2 +; W64-NEXT: # fallthrough-return + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_regular_select: +; W32: .functype test_regular_select (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.select +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_regular_select: +; W64: .functype test_regular_select (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.select +; W64-NEXT: # fallthrough-return + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; W32-LABEL: test_xor_all_ones: +; W32: .functype test_xor_all_ones () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_all_ones: +; W64: .functype test_xor_all_ones () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; W32-LABEL: test_xor_same_value: +; W32: .functype test_xor_same_value (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_same_value: +; W64: .functype test_xor_same_value (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; W32-LABEL: test_normal_ops: +; W32: .functype test_normal_ops (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_normal_ops: +; W64: .functype test_normal_ops (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %or1 = or i32 %x, 0 + %and1 = and i32 %or1, -1 + %xor1 = xor i32 %and1, 0 + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; W32-LABEL: test_xor_with_const_operands: +; W32: .functype test_xor_with_const_operands () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_with_const_operands: +; W64: .functype test_xor_with_const_operands () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) + diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll new file mode 100644 index 
0000000000000..0797265972a1f --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -0,0 +1,409 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 + +; Test ct.select edge cases and corner cases + +; Test with very large integers +define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { +; X64-LABEL: test_ctselect_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: cmovneq %rdx, %r8 +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i128: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, 12(%eax) +; X32-NEXT: movl %edx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 + %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %result +} + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; X64-LABEL: test_ctselect_i1: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i1: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; X64-LABEL: test_ctselect_extremal_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_extremal_values: +; X32: # %bb.0: +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with floating point special values +define float @test_ctselect_f32_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f32_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000 +; X64-NEXT: 
cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32_special_values: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) + ret float %result +} + +define double @test_ctselect_f64_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f64_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64_special_values: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 36 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: sete %al +; X32-NEXT: fxch %st(1) +; X32-NEXT: fstpl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: addl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) + ret double %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; X64-LABEL: test_ctselect_null_ptr: +; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_null_ptr: +; X32: # %bb.0: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: 
retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; X64-LABEL: test_ctselect_function_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_function_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with volatile loads +define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_volatile_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_volatile_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load volatile i32, ptr %p1 + %b = load volatile i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with atomic loads +define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_atomic_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_atomic_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load atomic i32, ptr %p1 acquire, align 4 + %b = load atomic i32, ptr %p2 acquire, align 4 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr_cmp: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovneq %rdx, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr_cmp: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types (struct types themselves may not be directly supported) +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_struct_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_struct_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr 
%a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions (stress test for instruction selection) +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; X64-LABEL: test_ctselect_deeply_nested: +; X64: # %bb.0: +; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %r8d, %r9d +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %r9d, %r11d +; X64-NEXT: testb $1, %dl +; X64-NEXT: cmovnel %r11d, %r10d +; X64-NEXT: testb $1, %cl +; X64-NEXT: cmovnel %r10d, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_deeply_nested: +; X32: # %bb.0: +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %esi, %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %edx, %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Test with misaligned loads +define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_misaligned_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_misaligned_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax +; X32-NEXT: retl + %a = load i32, ptr %p1, align 1 + %b = load i32, ptr %p2, align 1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i128 @llvm.ct.select.i128(i1, i128, i128) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll new file mode 100644 index 0000000000000..ea943307c644f --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -0,0 +1,722 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with floating-point types +; - Without CMOV: constant-time implementation using FP->int conversion + existing post-RA CTSELECT +; - With CMOV: CMOV-based implementation +; - 
Verifies security properties: no conditional branches, constant execution time +; Strategy: FP values stored to memory, converted to integers, CTSELECT on integers, converted back to FP + +; Test basic f32 functionality +define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test f32 with different condition codes +define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_eq: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: setnp %al +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: testb %al, %cl +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_eq: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: 
setnp %al +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %al, %cl +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %cmp = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test basic f64 functionality +define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f64_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $8, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldl (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f64_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $8, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldl (%esp) +; I386-CMOV-NEXT: addl $8, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test basic x86_fp80 functionality +define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { 
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Test f32 with complex conditions +define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: 
test_ctselect_f32_gt: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: seta %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_gt: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: seta %al +; I386-CMOV-NEXT: testb %al, %al +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %cmp = fcmp ogt float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test constant-time properties: verify no branches in generated code +define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_no_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: 
movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test that BUNDLE directives are present for constant-time guarantees +define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_bundled: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_bundled: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test edge case: NaN handling +define float @test_ctselect_f32_nan(i1 %cond) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_nan: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-NOCMOV-NEXT: fldz +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: fxch %st(1) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_nan: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, 
%esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf + %zero = bitcast i32 0 to float + %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero) + ret float %result +} + +; Test memory alignment for f80 +define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_alignment: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), 
%edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Stress test: multiple CTSELECT operations +define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_multiple: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $8, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_multiple: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $8, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, 
%esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $8, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c) + ret float %sel2 +} + +; Declare intrinsics +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll new file mode 100644 index 0000000000000..2cb67ba9c29b5 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll @@ -0,0 +1,418 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV + +; Test constant-time selection with MMX intrinsics to exercise VR64 CTSELECT +; These tests use MMX intrinsics to create <1 x i64> values that get allocated to VR64 registers + +; Test MMX ct.select using paddd intrinsic to force VR64 allocation +define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: paddd %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; 
I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: paddd %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test MMX ct.select using psllw intrinsic +define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: psllw %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl 
%edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: psllw %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test nested MMX ct.selects with pand intrinsic +define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) { +; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edx, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %eax, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ecx +; I386-NOCMOV-NEXT: negl %ecx +; I386-NOCMOV-NEXT: movl %esi, %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: notl %ecx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: orl %ecx, %ebp +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %cl +; I386-NOCMOV-NEXT: testb %cl, %cl +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: movb %cl, %ch +; I386-NOCMOV-NEXT: movzbl %ch, %ebx +; I386-NOCMOV-NEXT: negl %ebx +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %ebx, %esi +; I386-NOCMOV-NEXT: notl %ebx +; I386-NOCMOV-NEXT: andl %ebp, %ebx +; I386-NOCMOV-NEXT: orl %ebx, %esi +; 
I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %cl, %ch +; I386-NOCMOV-NEXT: movzbl %ch, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %eax, %edx +; I386-NOCMOV-NEXT: andl %esi, %edx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %edx +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: pand %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 32 +; I386-CMOV-NEXT: .cfi_offset %esi, -12 +; I386-CMOV-NEXT: .cfi_offset %ebx, -8 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bl +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bh +; I386-CMOV-NEXT: testb %bh, %bh +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: testb %bl, %bl +; I386-CMOV-NEXT: cmovnel %esi, %edx +; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: pand %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: popl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %mmx_c = bitcast i64 %c to <1 x i64> + %cmp1 = icmp ne i32 %cond1, 0 + %cmp2 = icmp ne i32 %cond2, 0 + %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c) + %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2) + ret <1 x i64> %result +} + +; Test MMX ct.select with por intrinsic +define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; 
I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: por %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_por: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: por %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Declare MMX intrinsics +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) + +; Declare constant-time selection intrinsic +declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll new file mode 100644 index 0000000000000..d7345f1121540 --- /dev/null +++ 
b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with scalar integer types +; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions +; - With CMOV: CMOV-based implementation +; - Verifies security properties: no conditional branches, constant execution time +; All expansion happens post-RA for better optimization control and constant-time guarantees + +; Test basic i32 functionality +define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test i16 functionality +define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i16_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbw %bh, %si +; I386-NOCMOV-NEXT: negw %si +; I386-NOCMOV-NEXT: movw %dx, %ax +; I386-NOCMOV-NEXT: andw %si, %ax +; I386-NOCMOV-NEXT: notw %si +; I386-NOCMOV-NEXT: andw %cx, %si +; I386-NOCMOV-NEXT: orw %si, %ax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i16_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; I386-CMOV-NEXT: retl + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +; Test i8 functionality +define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i8_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %ah +; I386-NOCMOV-NEXT: movb %ah, %ch +; I386-NOCMOV-NEXT: negb %ch +; I386-NOCMOV-NEXT: movb %dl, %al +; I386-NOCMOV-NEXT: 
andb %ch, %al +; I386-NOCMOV-NEXT: notb %ch +; I386-NOCMOV-NEXT: andb %cl, %ch +; I386-NOCMOV-NEXT: orb %ch, %al +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i8_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax +; I386-CMOV-NEXT: retl + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +; Test security property: constant-time execution for cryptographic use case +define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind { +; I386-NOCMOV-LABEL: test_crypto_key_select: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_crypto_key_select: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret_bit, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2) + ret i32 %result +} + +; Test that no conditional branches appear in constant-time path +define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind { +; I386-NOCMOV-LABEL: test_no_conditional_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_no_conditional_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +; Test with comparison condition +define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_cmp: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_cmp: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp eq i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +; Test nested selects +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_nested: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %eax, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %dl +; I386-NOCMOV-NEXT: movb %dl, %dh +; I386-NOCMOV-NEXT: movzbl %dh, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %ecx, %eax +; I386-NOCMOV-NEXT: andl %edi, %eax +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %esi, %edi +; I386-NOCMOV-NEXT: orl %edi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_nested: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c) + ret i32 %sel2 +} + +; Declare ct.select intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll new file mode 100644 index 0000000000000..481d49971a937 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s + +; Test ct.select optimization patterns + +; Test smin(x, 0) pattern optimization +define i32 @test_ctselect_smin_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; 
CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern optimization +define i32 @test_ctselect_smax_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smax_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setl %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: seta %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; CHECK-LABEL: test_ctselect_abs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; CHECK-LABEL: test_ctselect_nabs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: negl %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_sign_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; 
CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_zero_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setne %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test mask generation pattern +define i32 @test_ctselect_mask_generation(i32 %x) { +; CHECK-LABEL: test_ctselect_mask_generation: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_true: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movb $1, %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_false: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; CHECK-LABEL: test_ctselect_identical_operands: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_inverted_condition: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: sete %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test for 64-bit specific optimizations +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; CHECK-LABEL: test_ctselect_i64_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rdi, %rax +; CHECK-NEXT: retq + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Test for floating point optimizations +define float @test_ctselect_f32_zero_positive(float %x) { +; CHECK-LABEL: test_ctselect_f32_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss 
%xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %eax, %edx +; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt float %x, 0.0 + %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0) + ret float %result +} + +define double @test_ctselect_f64_zero_positive(double %x) { +; CHECK-LABEL: test_ctselect_f64_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rax, %rdx +; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt double %x, 0.0 + %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0) + ret double %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test_ctselect_chain: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %ecx, %r8d +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovnel %r8d, %r9d +; CHECK-NEXT: testb $1, %dl +; CHECK-NEXT: cmovnel %r9d, %eax +; CHECK-NEXT: retq + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll new file mode 100644 index 0000000000000..2206e32cd6d34 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -0,0 +1,1274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 + +; Test ct.select functionality for vector types + +; 128-bit vectors +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: 
movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB0_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB0_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: test_ctselect_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB1_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB1_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: test_ctselect_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB2_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB2_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: test_ctselect_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB3_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %xmm0, %xmm1 +; AVX512-NEXT: .LBB3_2: +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; 256-bit vectors +define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: test_ctselect_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax 
+; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB4_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB4_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %result +} + +define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: test_ctselect_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movaps %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; 
AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB5_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB5_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) + ret <8 x float> %result +} + +define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: test_ctselect_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB6_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB6_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %result +} + +define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) { +; SSE2-LABEL: test_ctselect_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movapd %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd 
%eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB7_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %ymm0, %ymm1 +; AVX512-NEXT: .LBB7_2: +; AVX512-NEXT: vmovapd %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) + ret <4 x double> %result +} + +; 512-bit vectors (AVX512 only) +define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) { +; SSE2-LABEL: test_ctselect_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: 
movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB8_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB8_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) + ret <16 x i32> %result +} + +define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) { +; SSE2-LABEL: test_ctselect_v16f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movaps %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn 
%xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB9_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB9_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) + ret <16 x float> %result +} + +define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { +; SSE2-LABEL: test_ctselect_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, 
%xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB10_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB10_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) + ret <8 x i64> %result +} + +define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) { +; SSE2-LABEL: test_ctselect_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movapd %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; 
SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB11_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %zmm0, %zmm1 +; AVX512-NEXT: .LBB11_2: +; AVX512-NEXT: vmovapd %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) + ret <8 x double> %result +} + +; Test with constant conditions for vector types +define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_true: +; SSE2: # %bb.0: +; SSE2-NEXT: movb $1, %al +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn 
%xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_true: +; AVX: # %bb.0: +; AVX-NEXT: movb $1, %al +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_true: +; AVX2: # %bb.0: +; AVX2-NEXT: movb $1, %al +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_const_true: +; AVX512: # %bb.0: +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_false: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_false: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_false: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_const_false: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with comparison conditions for vector types +define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_icmp: +; SSE2: # %bb.0: +; SSE2-NEXT: cmpl %esi, %edi +; SSE2-NEXT: sete %al +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, 
%eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_icmp: +; AVX: # %bb.0: +; AVX-NEXT: cmpl %esi, %edi +; AVX-NEXT: sete %al +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_icmp: +; AVX2: # %bb.0: +; AVX2-NEXT: cmpl %esi, %edi +; AVX2-NEXT: sete %al +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_icmp: +; AVX512: # %bb.0: +; AVX512-NEXT: cmpl %esi, %edi +; AVX512-NEXT: je .LBB14_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: .LBB14_2: +; AVX512-NEXT: retq + %cond = icmp eq i32 %x, %y + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Declare the intrinsics +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) +declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>) +declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>) +declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>) +declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>) +declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>) +declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>) +declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>) +declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>) diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll new file mode 100644 index 0000000000000..3f6276add0a5c --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -0,0 +1,946 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=X32-NOCMOV + +; Test basic ct.select functionality for scalar types + +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; X64-LABEL: test_ctselect_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, 
%eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i8: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i8: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %ah +; X32-NOCMOV-NEXT: movb %ah, %ch +; X32-NOCMOV-NEXT: negb %ch +; X32-NOCMOV-NEXT: movb %dl, %al +; X32-NOCMOV-NEXT: andb %ch, %al +; X32-NOCMOV-NEXT: notb %ch +; X32-NOCMOV-NEXT: andb %cl, %ch +; X32-NOCMOV-NEXT: orb %ch, %al +; X32-NOCMOV-NEXT: retl + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; X64-LABEL: test_ctselect_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i16: +; X32: # %bb.0: +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i16: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbw %bh, %si +; X32-NOCMOV-NEXT: negw %si +; X32-NOCMOV-NEXT: movw %dx, %ax +; X32-NOCMOV-NEXT: andw %si, %ax +; X32-NOCMOV-NEXT: notw %si +; X32-NOCMOV-NEXT: andw %cx, %si +; X32-NOCMOV-NEXT: orw %si, %ax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i32: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; 
X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; X64-LABEL: test_ctselect_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i64: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i64: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %ebp +; X32-NOCMOV-NEXT: negl %ebp +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %ebp, %eax +; X32-NOCMOV-NEXT: notl %ebp +; X32-NOCMOV-NEXT: andl %ecx, %ebp +; X32-NOCMOV-NEXT: orl %ebp, %eax +; X32-NOCMOV-NEXT: movb %bl, %cl +; X32-NOCMOV-NEXT: movzbl %cl, %ebp +; X32-NOCMOV-NEXT: negl %ebp +; X32-NOCMOV-NEXT: movl %edi, %edx +; X32-NOCMOV-NEXT: andl %ebp, %edx +; X32-NOCMOV-NEXT: notl %ebp +; X32-NOCMOV-NEXT: andl %esi, %ebp +; X32-NOCMOV-NEXT: orl %ebp, %edx +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; X64-LABEL: test_ctselect_f32: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: 
popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; X64-LABEL: test_ctselect_f64: +; X64: # %bb.0: +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl (%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f64: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: subl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; 
X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fldl (%esp) +; X32-NOCMOV-NEXT: addl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_ptr: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_const_true: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movb $1, %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_const_true: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movb $1, %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_const_true: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb $1, %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; 
X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_const_false: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_const_false: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_const_false: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %eax, %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_eq: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_eq: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_eq: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp eq 
i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_ne: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setne %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_ne: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setne %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_ne: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setne %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_slt: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setl %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_slt: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setl %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_slt: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setl %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + 
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_ult: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setb %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_ult: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setb %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_ult: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setb %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { +; X64-LABEL: test_ctselect_fcmp_oeq: +; X64: # %bb.0: +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: setnp %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb %dl, %sil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_fcmp_oeq: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: fucompi %st(1), %st +; X32-NEXT: fstp %st(0) +; X32-NEXT: setnp %al +; X32-NEXT: sete %cl +; X32-NEXT: testb %al, %cl +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: 
.cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fucompp +; X32-NOCMOV-NEXT: fnstsw %ax +; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; X32-NOCMOV-NEXT: sahf +; X32-NOCMOV-NEXT: setnp %al +; X32-NOCMOV-NEXT: sete %cl +; X32-NOCMOV-NEXT: testb %al, %cl +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_load: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl (%ecx), %ecx +; X32-NOCMOV-NEXT: movl (%eax), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; X64-LABEL: test_ctselect_nested: +; X64: # %bb.0: +; X64-NEXT: movl %r8d, %eax +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %edx, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_nested: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_nested: +; X32-NOCMOV: # %bb.0: +; 
X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %eax, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %dl +; X32-NOCMOV-NEXT: movb %dl, %dh +; X32-NOCMOV-NEXT: movzbl %dh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %ecx, %eax +; X32-NOCMOV-NEXT: andl %edi, %eax +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %esi, %edi +; X32-NOCMOV-NEXT: orl %edi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
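
For context, a minimal C-level sketch of how the builtin exercised by these llc tests might be used; the ct_max_u32 helper and its use case are illustrative assumptions, not part of the patch:

/* Hypothetical usage sketch (not from the patch): pick the larger of two
 * values without a secret-dependent branch. The three-argument form
 * __builtin_ct_select(cond, a, b) matches the Sema/CodeGen checks added
 * earlier in this patch and lowers to the llvm.ct.select intrinsic whose
 * x86 lowering is checked above. */
static unsigned ct_max_u32(unsigned a, unsigned b) {
  return __builtin_ct_select(a > b, a, b);
}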