AArch64: avoid vectors with excessive VFs in performSelectCombine before legalization

Summary of the changes carried by this patch:

* llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  In performSelectCombine, when DCI.isBeforeLegalize() is true, NumMaskElts is
  overwritten with ResVT.getVectorNumElements() just before SrcVT is rebuilt
  as a vector type with NumMaskElts elements.

* llvm/test/CodeGen/AArch64/expand-select.ll
  The expected code for @foo and @bar no longer contains the
  movi/and/fmov/cmeq/fmov sequence; the condition is tested directly with
  "tst w0, #0x1" and the csel condition codes change from ne to eq.

* llvm/test/CodeGen/AArch64/neon-anyof-splat.ll (new file)
  Adds @any_of_select_vf4, @any_of_select_vf2 and @any_of_select_vf32. Each
  one compares a mask vector against zero (icmp slt), bitcasts the <N x i1>
  result to an iN integer, compares that integer with 0 and uses the outcome
  as the scalar condition of a vector select. The expected code broadcasts the
  condition with dup and selects with bsl; the 32-lane case first combines the
  two halves of the mask with orr and emits two bsl instructions.

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6072fd9d8f242..d2c377b24ca2e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26965,6 +26965,10 @@ static SDValue performSelectCombine(SDNode *N,
   if (!ResVT.isVector() || NumMaskElts == 0)
     return SDValue();
 
+  // Avoid creating vectors with excessive VFs before legalization.
+  if (DCI.isBeforeLegalize())
+    NumMaskElts = ResVT.getVectorNumElements();
+
   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
 
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ca4719d9b6bf..8ad9ea3b7a8d5 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,15 @@
 define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 ; CHECK-LABEL: foo:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    and w8, w0, #0x1
-; CHECK-NEXT:    ldr x11, [sp]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldp x8, x10, [sp, #8]
-; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    csel x8, x5, x8, ne
-; CHECK-NEXT:    csel x9, x4, x11, ne
-; CHECK-NEXT:    stp x9, x8, [x10, #16]
-; CHECK-NEXT:    csel x8, x3, x7, ne
-; CHECK-NEXT:    csel x9, x2, x6, ne
-; CHECK-NEXT:    stp x9, x8, [x10]
+; CHECK-NEXT:    ldp x8, x9, [sp, #8]
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    ldr x10, [sp]
+; CHECK-NEXT:    csel x8, x5, x8, eq
+; CHECK-NEXT:    csel x10, x4, x10, eq
+; CHECK-NEXT:    stp x10, x8, [x9, #16]
+; CHECK-NEXT:    csel x8, x3, x7, eq
+; CHECK-NEXT:    csel x10, x2, x6, eq
+; CHECK-NEXT:    stp x10, x8, [x9]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
@@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
 ; CHECK-LABEL: bar:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    and w8, w0, #0x1
-; CHECK-NEXT:    ldr x10, [sp, #16]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    ldp x8, x9, [sp]
-; CHECK-NEXT:    csel x11, x2, x6, ne
-; CHECK-NEXT:    str x11, [x10]
-; CHECK-NEXT:    csel x8, x4, x8, ne
-; CHECK-NEXT:    stur x8, [x10, #12]
-; CHECK-NEXT:    csel x8, x5, x9, ne
-; CHECK-NEXT:    csel x9, x3, x7, ne
-; CHECK-NEXT:    str w8, [x10, #20]
-; CHECK-NEXT:    str w9, [x10, #8]
+; CHECK-NEXT:    ldp x8, x10, [sp]
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    ldr x9, [sp, #16]
+; CHECK-NEXT:    csel x11, x2, x6, eq
+; CHECK-NEXT:    csel x8, x4, x8, eq
+; CHECK-NEXT:    str x11, [x9]
+; CHECK-NEXT:    stur x8, [x9, #12]
+; CHECK-NEXT:    csel x8, x5, x10, eq
+; CHECK-NEXT:    csel x10, x3, x7, eq
+; CHECK-NEXT:    str w8, [x9, #20]
+; CHECK-NEXT:    str w10, [x9, #8]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
new file mode 100644
index 0000000000000..dedd4323f1519
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+target triple = "aarch64-linux-gnu"
+
+;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
+;; integer of a bitwidth equal to the number of lanes being reduced, then
+;; compared against zero. To select between vectors for NEON, we then need to
+;; broadcast the result, but we must be careful when the bitwidth of the scalar
+;; result is smaller than the element size of the vectors being selected. We
+;; don't want to end up with scalarization.
+
+define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: any_of_select_vf4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+  %cmp.bc = bitcast <4 x i1> %cmp to i4
+  %cmp.bc.not = icmp eq i4 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: any_of_select_vf2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm x8, ne
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i64> %mask, zeroinitializer
+  %cmp.bc = bitcast <2 x i1> %cmp to i2
+  %cmp.bc.not = icmp eq i2 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %res
+}
+
+define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: any_of_select_vf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v1.16b, w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    bsl v1.16b, v5.16b, v3.16b
+; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <32 x i8> %mask, zeroinitializer
+  %cmp.bc = bitcast <32 x i1> %cmp to i32
+  %cmp.bc.not = icmp eq i32 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
+  ret <32 x i8> %res
+}