diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5ffaf2c49b4c0..03d1f7e885829 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18995,6 +18995,18 @@ static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
     else if (SDValue R = performUADDVZextCombine(A, DAG))
       return R;
   }
+
+  // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
+  MVT VT = N->getSimpleValueType(0);
+  MVT OpVT = A.getSimpleValueType();
+  assert(VT == OpVT &&
+         "The operand type should be consistent with the result type of UADDV");
+  APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements());
+  Mask.clearBit(0);
+  KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
+  if (KnownLeadingLanes.isZero())
+    return A;
+
   return SDValue();
 }
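The combine relies on the demanded-elements form of SelectionDAG::computeKnownBits: the APInt mask demands every vector lane except lane 0, and KnownBits::isZero() then answers whether all demanded lanes are provably zero, in which case the UADDV is a no-op and its operand can be returned directly. As a minimal standalone sketch of how that mask behaves (illustrative only, not part of the patch), for a 4-lane vector:

    // Demanded-elements mask for "every lane but lane 0" of a 4-lane vector.
    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      llvm::APInt Mask = llvm::APInt::getAllOnes(4); // 0b1111: demand all lanes
      Mask.clearBit(0);                              // 0b1110: exclude lane 0
      // Prints 14; computeKnownBits is only asked about lanes 1..3.
      llvm::outs() << Mask.getZExtValue() << "\n";
      return 0;
    }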
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index bc675343adc08..d9180a28bd40b 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -493,3 +493,154 @@ entry:
   ret i128 %arg1
 }
 
+define i16 @addv_zero_lanes_v4i16(ptr %arr) {
+; CHECK-SD-LABEL: addv_zero_lanes_v4i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldrb w0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addv_zero_lanes_v4i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldrb w8, [x0]
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: addv h0, v0.4h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+  %v = load i64, ptr %arr
+  %and = and i64 %v, 255
+  %vec = bitcast i64 %and to <4 x i16>
+  %r = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %vec)
+  ret i16 %r
+}
+
+define i8 @addv_zero_lanes_v8i8(ptr %arr) {
+; CHECK-SD-LABEL: addv_zero_lanes_v8i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldrb w0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addv_zero_lanes_v8i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldrb w8, [x0]
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: addv b0, v0.8b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+  %v = load i64, ptr %arr
+  %and = and i64 %v, 255
+  %vec = bitcast i64 %and to <8 x i8>
+  %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %vec)
+  ret i8 %r
+}
+
+define i8 @addv_zero_lanes_negative_v8i8(ptr %arr) {
+; CHECK-LABEL: addv_zero_lanes_negative_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr x8, [x0]
+; CHECK-NEXT: and x8, x8, #0x100
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+  %v = load i64, ptr %arr
+  %and = and i64 %v, 256
+  %vec = bitcast i64 %and to <8 x i8>
+  %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %vec)
+  ret i8 %r
+}
+
+
+define i8 @addv_zero_lanes_v16i8(ptr %arr) {
+; CHECK-SD-LABEL: addv_zero_lanes_v16i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldrb w8, [x0]
+; CHECK-SD-NEXT: mov v0.d[0], x8
+; CHECK-SD-NEXT: addv b0, v0.16b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addv_zero_lanes_v16i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldrb w8, [x0]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: addv b0, v0.16b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+  %v = load i128, ptr %arr
+  %and = and i128 %v, 255
+  %vec = bitcast i128 %and to <16 x i8>
+  %r = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %vec)
+  ret i8 %r
+}
+
+define i16 @addv_zero_lanes_v8i16(ptr %arr) {
+; CHECK-SD-LABEL: addv_zero_lanes_v8i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: mov v0.d[0], x8
+; CHECK-SD-NEXT: addv h0, v0.8h
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addv_zero_lanes_v8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldrh w8, [x0]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: addv h0, v0.8h
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+  %v = load i128, ptr %arr
+  %and = and i128 %v, u0xFFFF
+  %vec = bitcast i128 %and to <8 x i16>
+  %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %vec)
+  ret i16 %r
+}
+
+define i32 @addv_zero_lanes_v4i32(ptr %arr) {
+; CHECK-SD-LABEL: addv_zero_lanes_v4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldr w8, [x0]
+; CHECK-SD-NEXT: mov v0.d[0], x8
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addv_zero_lanes_v4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+  %v = load i128, ptr %arr
+  %and = and i128 %v, u0xFFFFFFFF
+  %vec = bitcast i128 %and to <4 x i32>
+  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
+  ret i32 %r
+}
+
+define i32 @addv_zero_lanes_v2i32(ptr %arr) {
+; CHECK-SD-LABEL: addv_zero_lanes_v2i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr w0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: addv_zero_lanes_v2i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+  %v = load i64, ptr %arr
+  %and = and i64 %v, u0xFFFFFFFF
+  %vec = bitcast i64 %and to <2 x i32>
+  %r = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %vec)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll
index bf52e71ec21fe..c42ef1a96e5a3 100644
--- a/llvm/test/CodeGen/AArch64/abds.ll
+++ b/llvm/test/CodeGen/AArch64/abds.ll
@@ -497,13 +497,9 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind {
 define i64 @vector_legalized(i16 %a, i16 %b) {
 ; CHECK-LABEL: vector_legalized:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: sxth w8, w0
 ; CHECK-NEXT: subs w8, w8, w1, sxth
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: cneg w8, w8, mi
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: cneg w0, w8, mi
 ; CHECK-NEXT: ret
   %ea = sext i16 %a to i32
   %eb = sext i16 %b to i32
diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll
index 8d2b0b0742d7d..9fbcc1c82017f 100644
--- a/llvm/test/CodeGen/AArch64/abdu.ll
+++ b/llvm/test/CodeGen/AArch64/abdu.ll
@@ -362,13 +362,9 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 define i64 @vector_legalized(i16 %a, i16 %b) {
 ; CHECK-LABEL: vector_legalized:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: and w8, w0, #0xffff
 ; CHECK-NEXT: subs w8, w8, w1, uxth
-; CHECK-NEXT: cneg w8, w8, mi
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: cneg w0, w8, mi
 ; CHECK-NEXT: ret
   %ea = zext i16 %a to i32
   %eb = zext i16 %b to i32
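In both vector_legalized tests above, legalization leaves a reduction whose input vector is all zeros, so every lane the new combine demands is known zero and the movi/addp/fmov/add sequence folds away. A minimal sketch of what KnownBits::isZero() certifies in that situation (again illustrative, not the patch's code):

    #include "llvm/Support/KnownBits.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      // Known bits shared by the demanded lanes; an all-zeros input
      // proves every bit zero, i.e. the Zero mask is all ones.
      llvm::KnownBits Known(16);
      Known.Zero.setAllBits();
      // isZero() is true iff Zero covers every bit, licensing the fold.
      llvm::outs() << (Known.isZero() ? "fold uaddv away\n" : "keep uaddv\n");
      return 0;
    }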
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index c739be95cd243..d547b6bec5b83 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -505,3 +505,125 @@ entry:
   %s = call <4 x i128> @llvm.ctpop(<4 x i128> %d)
   ret <4 x i128> %s
 }
+
+define i8 @i8(i8 %x) {
+; CHECK-SD-LABEL: i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: and w8, w0, #0xff
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: and x8, x0, #0xff
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: uaddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+  %s = call i8 @llvm.ctpop.i8(i8 %x)
+  ret i8 %s
+}
+
+define i16 @i16_mask(i16 %x) {
+; CHECK-SD-LABEL: i16_mask:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: and w8, w0, #0xff
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i16_mask:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: and x8, x8, #0xffff
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: uaddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
+  %and = and i16 %x, 255
+  %s = call i16 @llvm.ctpop.i16(i16 %and)
+  ret i16 %s
+}
+
+define i32 @i32_mask(i32 %x) {
+; CHECK-SD-LABEL: i32_mask:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: and w8, w0, #0xff
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i32_mask:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: uaddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+  %and = and i32 %x, 255
+  %s = call i32 @llvm.ctpop.i32(i32 %and)
+  ret i32 %s
+}
+
+define i32 @i32_mask_negative(i32 %x) {
+; CHECK-SD-LABEL: i32_mask_negative:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: and w8, w0, #0xffff
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: cnt v0.8b, v0.8b
+; CHECK-SD-NEXT: addv b0, v0.8b
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i32_mask_negative:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and w8, w0, #0xffff
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: cnt v0.8b, v0.8b
+; CHECK-GI-NEXT: uaddlv h0, v0.8b
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+  %and = and i32 %x, 65535
+  %s = call i32 @llvm.ctpop.i32(i32 %and)
+  ret i32 %s
+}
+
+define i128 @i128_mask(i128 %x) {
+; CHECK-SD-LABEL: i128_mask:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: and x8, x0, #0xff
+; CHECK-SD-NEXT: mov x1, xzr
+; CHECK-SD-NEXT: mov v0.d[0], x8
+; CHECK-SD-NEXT: cnt v0.16b, v0.16b
+; CHECK-SD-NEXT: addv b0, v0.16b
+; CHECK-SD-NEXT: fmov x0, d0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i128_mask:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: and x8, x0, #0xff
+; CHECK-GI-NEXT: mov x1, xzr
+; CHECK-GI-NEXT: mov v0.d[0], x8
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: cnt v0.16b, v0.16b
+; CHECK-GI-NEXT: uaddlv h0, v0.16b
+; CHECK-GI-NEXT: mov w0, v0.s[0]
+; CHECK-GI-NEXT: ret
+entry:
+  %and = and i128 %x, 255
+  %s = call i128 @llvm.ctpop.i128(i128 %and)
+  ret i128 %s
+}
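The negative tests (the "and i64 %v, 256" case earlier and the 0xffff mask here) show the guard in action: the fold needs every lane other than lane 0 to be known zero, and a mask that can leave lane 1 nonzero blocks it. A small plain-C++ sketch of that per-lane check for the <8 x i8> view of an i64 (foldsToLane0 is a hypothetical helper, not something in the patch):

    #include <cstdint>
    #include <cstdio>

    // Byte lane k of an i64 covers bits [8k, 8k+7]; after an "and" with Mask,
    // lane k is known zero iff Mask has no bits set in that range.
    static bool foldsToLane0(uint64_t Mask) {
      for (int Lane = 1; Lane < 8; ++Lane)
        if ((Mask >> (8 * Lane)) & 0xff)
          return false; // lane may be nonzero: keep the reduction
      return true;      // only lane 0 can be nonzero: uaddv(A) --> A
    }

    int main() {
      std::printf("0xff   -> %s\n", foldsToLane0(0xff) ? "fold" : "no fold");
      std::printf("0x100  -> %s\n", foldsToLane0(0x100) ? "fold" : "no fold");
      std::printf("0xffff -> %s\n", foldsToLane0(0xffff) ? "fold" : "no fold");
      return 0;
    }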