diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..af1be0c8f3ddc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -0,0 +1,214 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; RV64-LABEL: test_ctselect_i1: +; RV64: # %bb.0: +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i1: +; RV32: # %bb.0: +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; RV64-LABEL: test_ctselect_extremal_values: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: lui a1, 524288 +; RV64-NEXT: subw a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_extremal_values: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: lui a1, 524288 +; RV32-NEXT: addi a2, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; RV64-LABEL: test_ctselect_null_ptr: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_null_ptr: +; RV32: # %bb.0: +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; RV64-LABEL: test_ctselect_function_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: neg a3, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_function_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: neg a3, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr_cmp: +; RV64: # %bb.0: +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a2, a0, a2 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr_cmp: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; 
RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_struct_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: neg a3, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_struct_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: neg a3, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; RV64-LABEL: test_ctselect_deeply_nested: +; RV64: # %bb.0: +; RV64-NEXT: lw t0, 0(sp) +; RV64-NEXT: xor a4, a4, a5 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: xor a5, a5, a6 +; RV64-NEXT: slli a1, a1, 63 +; RV64-NEXT: xor a6, a6, a7 +; RV64-NEXT: slli a2, a2, 63 +; RV64-NEXT: slli a3, a3, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: srai a1, a1, 63 +; RV64-NEXT: srai a2, a2, 63 +; RV64-NEXT: and a0, a4, a0 +; RV64-NEXT: xor a0, a0, a5 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: xor a1, a7, t0 +; RV64-NEXT: xor a0, a0, a6 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: srai a3, a3, 63 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: xor a0, a0, t0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_deeply_nested: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: andi a3, a3, 1 +; RV32-NEXT: neg t1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a4, t1, a4 +; RV32-NEXT: neg t1, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a5 +; RV32-NEXT: neg a5, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a6 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: or a0, a4, a0 +; RV32-NEXT: and a0, t1, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a5, a0 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: and a0, a6, a0 +; RV32-NEXT: and a1, a3, t0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..1149971fd090e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -0,0 +1,383 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test smin(x, 0) pattern +define i32 
@test_ctselect_smin_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a1, a0, 31 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a1, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smax_zero: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: sgtz a1, a1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_zero: +; RV32: # %bb.0: +; RV32-NEXT: sgtz a1, a0 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a1, a0 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umin_generic: +; RV32: # %bb.0: +; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umax_generic: 
+; RV32: # %bb.0: +; RV32-NEXT: sltu a2, a1, a0 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; RV64-LABEL: test_ctselect_abs: +; RV64: # %bb.0: +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: xor a1, a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: xor a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_abs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; RV64-LABEL: test_ctselect_nabs: +; RV64: # %bb.0: +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: xor a2, a0, a1 +; RV64-NEXT: sraiw a0, a0, 31 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nabs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; RV64-LABEL: test_ctselect_sign_extend: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a0, a0, 31 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_sign_extend: +; RV32: # %bb.0: +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; RV64-LABEL: test_ctselect_zero_extend: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_zero_extend: +; RV32: # %bb.0: +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: ret + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; RV64-LABEL: test_ctselect_identical_operands: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_identical_operands: +; RV32: # %bb.0: +; 
RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: neg a2, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a2, a2, a1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_inverted_condition: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: xor a2, a2, a3 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: xor a0, a0, a3 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_inverted_condition: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: not a0, a0 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: ret + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; RV64-LABEL: test_ctselect_chain: +; RV64: # %bb.0: +; RV64-NEXT: xor a3, a3, a4 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: xor a4, a4, a5 +; RV64-NEXT: slli a1, a1, 63 +; RV64-NEXT: xor a5, a5, a6 +; RV64-NEXT: slli a2, a2, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: srai a1, a1, 63 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: xor a0, a0, a4 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: xor a0, a0, a5 +; RV64-NEXT: srai a2, a2, 63 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: xor a0, a0, a6 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: neg a7, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a3, a7, a3 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a5 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: and a1, a2, a6 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; RV64-LABEL: test_ctselect_i64_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: srai a1, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a2, a1, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: ret + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll new file mode 100644 index 0000000000000..a02e1e4749443 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll @@ -0,0 +1,804 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -O3 | FileCheck %s --check-prefix=RV32 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvl128b -O3 | FileCheck %s --check-prefix=RV32-V128 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvl256b -O3 | FileCheck %s --check-prefix=RV64-V256 + + +; Basic pass-through select on nxv4i32 +define @ctsel_nxv4i32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a, %b) + ret %r +} + +; Select with loads (aligned) +define @ctsel_nxv4i32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: ctsel_nxv4i32_load: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_load: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; 
RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_load: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_load: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a = load , ptr %p1, align 16 + %b = load , ptr %p2, align 16 + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a, %b) + ret %r +} + +; Mixed: do arithmetic first, then select, then store +define void @ctsel_nxv4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; RV64-LABEL: ctsel_nxv4i32_mixed: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vadd.vv v10, v10, v10 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vs2r.v v8, (a3) +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_mixed: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vs2r.v v8, (a3) +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_mixed: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vadd.vv v8, v8, v8 +; RV32-V128-NEXT: vadd.vv v10, v10, v10 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; 
RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vs2r.v v8, (a3) +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_mixed: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vadd.vv v8, v8, v8 +; RV64-V256-NEXT: vadd.vv v10, v10, v10 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vs2r.v v8, (a3) +; RV64-V256-NEXT: ret + %a = load , ptr %p1, align 16 + %b = load , ptr %p2, align 16 + ; avoid scalable vector constants: use %a+%a and %b+%b + %a2 = add %a, %a + %b2 = add %b, %b + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a2, %b2) + store %r, ptr %out, align 16 + ret void +} + +; Const-true/false fold smoke tests +define @ctsel_nxv4i32_true( %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_true: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_true: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_true: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 true, %a, %b) + ret %r +} + +define @ctsel_nxv4i32_false( %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_false: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_false: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_false: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-V128-NEXT: vmv2r.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_false: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-V256-NEXT: vmv2r.v v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 false, %a, %b) + ret %r +} + +; Chain two selects to ensure masks don’t get merged away +define @ctsel_nxv4i32_chain(i1 %c1, i1 %c2, +; RV64-LABEL: ctsel_nxv4i32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v14, 0 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v16, a0 +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vmv.v.x v18, a1 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v16, v14, -1, v0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v18, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-NEXT: vand.vv v8, v16, v8 +; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vand.vv v10, v16, v10 +; RV64-NEXT: vnot.v v16, v14 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vv v8, v14, v8 +; RV64-NEXT: vand.vv v10, v16, v12 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v14, 0 +; 
RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vmv.v.x v18, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v16, v14, -1, v0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v18, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vnot.v v16, v16 +; RV32-NEXT: vand.vv v10, v16, v10 +; RV32-NEXT: vnot.v v16, v14 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v14, v8 +; RV32-NEXT: vand.vv v10, v16, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_chain: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v14, 0 +; RV32-V128-NEXT: andi a1, a1, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v16, a0 +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vmv.v.x v18, a1 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v16, v14, -1, v0 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmsne.vi v0, v18, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v16, v8 +; RV32-V128-NEXT: vnot.v v16, v16 +; RV32-V128-NEXT: vand.vv v10, v16, v10 +; RV32-V128-NEXT: vnot.v v16, v14 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vand.vv v8, v14, v8 +; RV32-V128-NEXT: vand.vv v10, v16, v12 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_chain: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v14, 0 +; RV64-V256-NEXT: andi a1, a1, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v16, a0 +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vmv.v.x v18, a1 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v16, v14, -1, v0 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmsne.vi v0, v18, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v16, v8 +; RV64-V256-NEXT: vnot.v v16, v16 +; RV64-V256-NEXT: vand.vv v10, v16, v10 +; RV64-V256-NEXT: vnot.v v16, v14 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vand.vv v8, v14, v8 +; RV64-V256-NEXT: vand.vv v10, v16, v12 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a, + %b, + %c) { + %t = call @llvm.ct.select.nxv4i32(i1 %c1, %a, %b) + %r = call @llvm.ct.select.nxv4i32(i1 %c2, %t, %c) + ret %r +} + +; A different element width +define @ctsel_nxv8i16_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv8i16_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv8i16_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, 
m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv8i16_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv8i16_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv8i16(i1 %cond, %a, %b) + ret %r +} + +define @ctsel_nxv16i8_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv16i8_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv16i8_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv16i8_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv16i8_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv16i8(i1 %cond, %a, %b) + ret %r +} + +; 64-bit elements (useful on RV64) +define @ctsel_nxv2i64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2i64_basic: +; RV64: # %bb.0: +; 
RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2i64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv2i64_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv2i64_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv2i64(i1 %cond, %a, %b) + ret %r +} + +; Floating-point scalable vectors (bitcasted in your fallback) +define @ctsel_nxv4f32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4f32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, 
v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %a, %b) + ret %r +} + +; FP arithmetic around select +define @ctsel_nxv4f32_arith(i1 %cond, %x, %y) { +; RV64-LABEL: ctsel_nxv4f32_arith: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vfadd.vv v12, v8, v10 +; RV64-NEXT: vfsub.vv v8, v8, v10 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-NEXT: vand.vv v12, v10, v12 +; RV64-NEXT: vnot.v v10, v10 +; RV64-NEXT: vand.vv v8, v10, v8 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_arith: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vfadd.vv v12, v8, v10 +; RV32-NEXT: vfsub.vv v8, v8, v10 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-NEXT: vand.vv v12, v10, v12 +; RV32-NEXT: vnot.v v10, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_arith: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vfadd.vv v12, v8, v10 +; RV32-V128-NEXT: vfsub.vv v8, v8, v10 +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v10, a0 +; RV32-V128-NEXT: vmsne.vi v0, v10, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v10, 0 +; RV32-V128-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-V128-NEXT: vand.vv v12, v10, v12 +; RV32-V128-NEXT: vnot.v v10, v10 +; RV32-V128-NEXT: vand.vv v8, v10, v8 +; RV32-V128-NEXT: vor.vv v8, v12, v8 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_arith: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vfadd.vv v12, v8, v10 +; RV64-V256-NEXT: vfsub.vv v8, v8, v10 +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v10, a0 +; RV64-V256-NEXT: vmsne.vi v0, v10, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v10, 0 +; RV64-V256-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-V256-NEXT: vand.vv v12, v10, v12 +; RV64-V256-NEXT: vnot.v v10, v10 +; RV64-V256-NEXT: vand.vv v8, v10, v8 +; RV64-V256-NEXT: vor.vv v8, v12, v8 +; RV64-V256-NEXT: ret + %sum = fadd %x, %y + %diff = fsub %x, %y + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %sum, %diff) + ret %r +} + +define @ctsel_nxv2f64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2f64_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli 
a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vmv.v.x v12, a0
+; RV64-NEXT: vmsne.vi v0, v12, 0
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.i v12, 0
+; RV64-NEXT: vmerge.vim v12, v12, -1, v0
+; RV64-NEXT: vand.vv v8, v12, v8
+; RV64-NEXT: vnot.v v12, v12
+; RV64-NEXT: vand.vv v10, v12, v10
+; RV64-NEXT: vor.vv v8, v8, v10
+; RV64-NEXT: ret
+;
+; RV32-LABEL: ctsel_nxv2f64_basic:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV32-NEXT: vmv.v.x v12, a0
+; RV32-NEXT: vmsne.vi v0, v12, 0
+; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmerge.vim v12, v12, -1, v0
+; RV32-NEXT: vand.vv v8, v12, v8
+; RV32-NEXT: vnot.v v12, v12
+; RV32-NEXT: vand.vv v10, v12, v10
+; RV32-NEXT: vor.vv v8, v8, v10
+; RV32-NEXT: ret
+;
+; RV32-V128-LABEL: ctsel_nxv2f64_basic:
+; RV32-V128: # %bb.0:
+; RV32-V128-NEXT: andi a0, a0, 1
+; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV32-V128-NEXT: vmv.v.x v12, a0
+; RV32-V128-NEXT: vmsne.vi v0, v12, 0
+; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32-V128-NEXT: vmv.v.i v12, 0
+; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0
+; RV32-V128-NEXT: vand.vv v8, v12, v8
+; RV32-V128-NEXT: vnot.v v12, v12
+; RV32-V128-NEXT: vand.vv v10, v12, v10
+; RV32-V128-NEXT: vor.vv v8, v8, v10
+; RV32-V128-NEXT: ret
+;
+; RV64-V256-LABEL: ctsel_nxv2f64_basic:
+; RV64-V256: # %bb.0:
+; RV64-V256-NEXT: andi a0, a0, 1
+; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-V256-NEXT: vmv.v.x v12, a0
+; RV64-V256-NEXT: vmsne.vi v0, v12, 0
+; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-V256-NEXT: vmv.v.i v12, 0
+; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0
+; RV64-V256-NEXT: vand.vv v8, v12, v8
+; RV64-V256-NEXT: vnot.v v12, v12
+; RV64-V256-NEXT: vand.vv v10, v12, v10
+; RV64-V256-NEXT: vor.vv v8, v8, v10
+; RV64-V256-NEXT: ret
+  %r = call <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1 %cond, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 2 x double> %r
+}
+
+declare <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 8 x i16> @llvm.ct.select.nxv8i16(i1, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.ct.select.nxv16i8(i1, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 2 x i64> @llvm.ct.select.nxv2i64(i1, <vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 4 x float> @llvm.ct.select.nxv4f32(i1, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1, <vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll
new file mode 100644
index 0000000000000..255d575ca8f9f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -O3 -filetype=asm | FileCheck %s --check-prefix=RV64
+; RUN: llc < %s -mtriple=riscv32 -O3 -filetype=asm | FileCheck %s --check-prefix=RV32
+
+; Test 1: Basic optimizations should still work
+define i32 @test_basic_opts(i32 %x) {
+; RV64-LABEL: test_basic_opts:
+; RV64: # %bb.0:
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_basic_opts:
+; RV32: # %bb.0:
+; RV32-NEXT: ret
+  %a = or i32 %x, 0 ; Should eliminate
+  %b = and i32 %a, -1 ; Should eliminate
+  %c = xor i32 %b, 0 ; Should eliminate
+  ret i32 %c
+}
+
+; Test 2: Constant folding should work
+define i32 @test_constant_fold() {
+; RV64-LABEL: test_constant_fold:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_constant_fold:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: ret
+  %a = xor i32 -1, -1 ; Should fold to 0
+  ret i32 %a
+}
+
+; Test 3: Protected pattern should NOT have branches
+define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_protected_no_branch:
+; RV64: # %bb.0:
+; RV64-NEXT: xor a1, a1, a2
+; RV64-NEXT: slli a0, a0, 63
+; RV64-NEXT: srai a0, a0, 63
+; RV64-NEXT: and a0, a1, a0
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_protected_no_branch:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: neg a3, a0
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a1, a3, a1
+; RV32-NEXT: and a0, a0, a2
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: ret
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test 4: Explicit branch should still generate branches
+define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_explicit_branch:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a0, a0, 1
+; RV64-NEXT: beqz a0, .LBB3_2
+; RV64-NEXT: # %bb.1: # %true
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB3_2: # %false
+; RV64-NEXT: mv a0, a2
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_explicit_branch:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: beqz a0, .LBB3_2
+; RV32-NEXT: # %bb.1: # %true
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB3_2: # %false
+; RV32-NEXT: mv a0, a2
+; RV32-NEXT: ret
+  br i1 %cond, label %true, label %false
+true:
+  ret i32 %a
+false:
+  ret i32 %b
+}
+
+; Test 5: Regular select (not ct.select) - the target is free to lower this however it likes, including with branches
+define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_regular_select:
+; RV64: # %bb.0:
+; RV64-NEXT: andi a3, a0, 1
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: bnez a3, .LBB4_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: mv a0, a2
+; RV64-NEXT: .LBB4_2:
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_regular_select:
+; RV32: # %bb.0:
+; RV32-NEXT: andi a3, a0, 1
+; RV32-NEXT: mv a0, a1
+; RV32-NEXT: bnez a3, .LBB4_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: mv a0, a2
+; RV32-NEXT: .LBB4_2:
+; RV32-NEXT: ret
+  %result = select i1 %cond, i32 %a, i32 %b
+  ret i32 %result
+}
+
+; Test if XOR with all-ones still gets optimized
+define i32 @test_xor_all_ones() {
+; RV64-LABEL: test_xor_all_ones:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_xor_all_ones:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: ret
+  %xor1 = xor i32 -1, -1 ; Should optimize to 0
+  ret i32 %xor1
+}
+
+define i32 @test_xor_same_value(i32 %x) {
+; RV64-LABEL: test_xor_same_value:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_xor_same_value:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: ret
+  %xor2 = xor i32 %x, %x ; Should optimize to 0
+  ret i32 %xor2
+}
+
+define i32 @test_normal_ops(i32 %x) {
+; RV64-LABEL: test_normal_ops:
+; RV64: # %bb.0:
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_normal_ops:
+; RV32: # %bb.0:
+; RV32-NEXT: ret
+  %or1 = or i32 %x, 0 ; Should optimize to %x
+  %and1 = and i32 %or1, -1 ; Should optimize to %x
+  %xor1 = xor i32 %and1, 0 ; Should optimize to %x
+  ret i32 %xor1
+}
+
+; XOR of equal constant operands should still constant-fold around the protected pattern
+define i32 @test_xor_with_const_operands() {
+; RV64-LABEL: test_xor_with_const_operands:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_xor_with_const_operands:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: ret
+  %a = xor i32 -1, -1 ; -1 ^ -1 should become 0
+  %b = xor i32 0, 0 ; 0 ^ 0 should become 0
+  %c = xor i32 42, 42 ; 42 ^ 42 should become 0
+  %result = or i32 %a, %b
+  %final = or i32 %result, %c
+  ret i32 %final ; Should optimize to 0
+}
+
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
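
For reference, a minimal IR sketch of the branchless pattern these tests pin down (illustrative only, not part of the patch; the function name and value names below are made up). The scalar fallback checked above computes result = (mask & a) | (~mask & b) with mask being 0 or all-ones, which is exactly the andi/neg/addi -1/and/and/or sequence in the RV32 CHECK lines:

; Illustrative sketch of the mask-based constant-time select expansion.
define i32 @ctselect_expansion_sketch(i1 %cond, i32 %a, i32 %b) {
  %c = zext i1 %cond to i32     ; 0 or 1
  %mask = sub i32 0, %c         ; 0x00000000 or 0xffffffff
  %notmask = xor i32 %mask, -1  ; complement of the mask
  %ta = and i32 %mask, %a       ; keeps %a when %cond is true
  %fb = and i32 %notmask, %b    ; keeps %b when %cond is false
  %r = or i32 %ta, %fb
  ret i32 %r
}

Because %r depends on %cond only through the data flow of %mask, no conditional branch on %cond is introduced, which is the property the "no branch" assertions above are guarding.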