diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 710ca7fcba756..9251d4e08d397 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1507,14 +1507,26 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom); setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom); + // v2i64/v16i8 SUMLA always reduces to v4i32 SUMLA via + // LowerPARTIAL_REDUCE_MLA, regardless of i8mm; v2i32/v16i8 SUMLA goes + // through the same widen-to-v4i32 path as the SMLA/UMLA cases above. + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64, + MVT::v16i8, Custom); + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32, + MVT::v16i8, Custom); + if (Subtarget->hasMatMulInt8()) { setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32, MVT::v16i8, Legal); - setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64, - MVT::v16i8, Custom); - setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32, MVT::v8i8, Legal); + } else { + // Native dotprod without i8mm: lower SUMLA to two UDOT products in + // LowerPARTIAL_REDUCE_MLA. + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32, + MVT::v16i8, Custom); + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32, + MVT::v8i8, Custom); } } @@ -33115,14 +33127,12 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op, return Scatter; } -/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing -/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can -/// however still make use of the dot product instruction by instead -/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64. -/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise -/// the following pattern is emitted: -/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0)), ext(EXTRACT_SUBVECTOR(N, -/// NTy/2)))) +/// Lower a PARTIAL_REDUCE_MLA node. Three cases are handled: +/// 1. (v2i32, v16i8): widen Acc to v4i32 and fold the high half with ADDP. +/// 2. (nx)v2i64/(nx)v16i8: accumulate in two steps via v4i32, using +/// (U|S)ADDW(B|T) when available, otherwise add(add(Acc, ext(lo), ext(hi))). +/// 3. SUMLA on (v4i32, v16i8) or (v2i32, v8i8) without +i8mm: rewrite as two +/// UDOTs using the bias-128 identity sext(s) = zext(s ^ 128) - 128. SDValue AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const { @@ -33146,6 +33156,28 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op, return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0); } + // Lower PARTIAL_REDUCE_SUMLA on targets without +i8mm using udot via + // sum(sext(LHS) * zext(RHS)) = + // sum(zext(LHS ^ 128) * zext(RHS)) - sum(128 * zext(RHS)) + // using sext(s) = zext(s ^ 128) - 128, where XOR with 128 flips the + // sign bit of each i8 lane, mapping signed values to their unsigned + // bias-128 representation. + // The (v2i64, v16i8) case is handled by the v4i32 reduction below, which + // recursively re-enters this path. + if (Op.getOpcode() == ISD::PARTIAL_REDUCE_SUMLA && + !Subtarget->hasMatMulInt8() && Subtarget->hasDotProd() && + ((ResultVT == MVT::v4i32 && OpVT == MVT::v16i8) || + (ResultVT == MVT::v2i32 && OpVT == MVT::v8i8))) { + SDValue SignFlipMask = DAG.getConstant(128, DL, OpVT); + SDValue BiasedLHS = DAG.getNode(ISD::XOR, DL, OpVT, LHS, SignFlipMask); + SDValue BiasedDot = DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, ResultVT, Acc, + BiasedLHS, RHS); + SDValue BiasCorrection = + DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, ResultVT, + DAG.getConstant(0, DL, ResultVT), SignFlipMask, RHS); + return DAG.getNode(ISD::SUB, DL, ResultVT, BiasedDot, BiasCorrection); + } + bool ConvertToScalable = ResultVT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true); diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 709d3d387d3a1..b5801f8f48057 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -211,14 +211,12 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; ; CHECK-DOT-LABEL: usdot: ; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-DOT-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h -; CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h -; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-DOT-NEXT: movi v3.16b, #128 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v2.16b, v2.16b, v3.16b +; CHECK-DOT-NEXT: udot v4.4s, v3.16b, v1.16b +; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b +; CHECK-DOT-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: usdot: @@ -258,23 +256,22 @@ define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){ ; ; CHECK-DOT-LABEL: usdot_in_loop: ; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v1.16b, #128 ; CHECK-DOT-NEXT: mov x8, xzr ; CHECK-DOT-NEXT: .LBB6_1: // %vector.body ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-DOT-NEXT: ldr q2, [x0, x8] -; CHECK-DOT-NEXT: ldr q3, [x1, x8] -; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: ldr q0, [x0, x8] +; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v4.16b, v0.16b, v1.16b +; CHECK-DOT-NEXT: mov v0.16b, v2.16b +; CHECK-DOT-NEXT: ldr q2, [x1, x8] ; CHECK-DOT-NEXT: add x8, x8, #16 -; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0 -; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: udot v3.4s, v1.16b, v2.16b ; CHECK-DOT-NEXT: cmp x8, #16 -; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h -; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h -; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h -; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-DOT-NEXT: mov v5.16b, v0.16b +; CHECK-DOT-NEXT: udot v5.4s, v4.16b, v2.16b +; CHECK-DOT-NEXT: sub v2.4s, v5.4s, v3.4s ; CHECK-DOT-NEXT: b.ne .LBB6_1 ; CHECK-DOT-NEXT: // %bb.2: // %end ; CHECK-DOT-NEXT: ret @@ -316,6 +313,92 @@ end: ret <4 x i32> %acc } +; Same as @usdot_in_loop, but with a non-zero initial accumulator carried +; across iterations. Validates that the SUMLA -> 2x udot identity composes +; correctly when the accumulator is not zero on entry. +define <4 x i32> @usdot_in_loop_nonzero_acc(ptr %p1, ptr %p2, <4 x i32> %init){ +; CHECK-NODOT-LABEL: usdot_in_loop_nonzero_acc: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: mov x8, xzr +; CHECK-NODOT-NEXT: .LBB7_1: // %vector.body +; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NODOT-NEXT: ldr q2, [x0, x8] +; CHECK-NODOT-NEXT: ldr q3, [x1, x8] +; CHECK-NODOT-NEXT: mov v1.16b, v0.16b +; CHECK-NODOT-NEXT: add x8, x8, #16 +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: cmp x8, #16 +; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v5.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v4.8h, v5.8h +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v3.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v3.8h +; CHECK-NODOT-NEXT: b.ne .LBB7_1 +; CHECK-NODOT-NEXT: // %bb.2: // %end +; CHECK-NODOT-NEXT: mov v0.16b, v1.16b +; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: usdot_in_loop_nonzero_acc: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v2.16b, #128 +; CHECK-DOT-NEXT: mov x8, xzr +; CHECK-DOT-NEXT: .LBB7_1: // %vector.body +; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-NEXT: ldr q1, [x0, x8] +; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v4.16b, v1.16b, v2.16b +; CHECK-DOT-NEXT: mov v1.16b, v0.16b +; CHECK-DOT-NEXT: ldr q0, [x1, x8] +; CHECK-DOT-NEXT: add x8, x8, #16 +; CHECK-DOT-NEXT: udot v3.4s, v2.16b, v0.16b +; CHECK-DOT-NEXT: cmp x8, #16 +; CHECK-DOT-NEXT: mov v5.16b, v1.16b +; CHECK-DOT-NEXT: udot v5.4s, v4.16b, v0.16b +; CHECK-DOT-NEXT: sub v0.4s, v5.4s, v3.4s +; CHECK-DOT-NEXT: b.ne .LBB7_1 +; CHECK-DOT-NEXT: // %bb.2: // %end +; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: usdot_in_loop_nonzero_acc: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: mov x8, xzr +; CHECK-DOT-I8MM-NEXT: .LBB7_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-I8MM-NEXT: mov v1.16b, v0.16b +; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 +; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v3.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: cmp x8, #16 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB7_1 +; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end +; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: ret +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %acc = phi <4 x i32> [ %init, %entry ], [ %partial.reduce, %vector.body ] + %gep1 = getelementptr i8, ptr %p1, i64 %index + %load1 = load <16 x i8>, ptr %gep1, align 16 + %load1.wide = sext <16 x i8> %load1 to <16 x i32> + %gep2 = getelementptr i8, ptr %p2, i64 %index + %load2 = load <16 x i8>, ptr %gep2, align 16 + %load2.wide = zext <16 x i8> %load2 to <16 x i32> + %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide + %partial.reduce = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) + %index.next = add nuw i64 %index, 16 + %cmp = icmp eq i64 %index.next, 16 + br i1 %cmp, label %end, label %vector.body + +end: + ret <4 x i32> %acc +} + define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-NODOT-LABEL: usdot_narrow: ; CHECK-NODOT: // %bb.0: @@ -336,19 +419,12 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; ; CHECK-DOT-LABEL: usdot_narrow: ; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-DOT-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: mov d4, v1.d[1] -; CHECK-DOT-NEXT: mov d5, v2.d[1] -; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-DOT-NEXT: mov d3, v3.d[1] -; CHECK-DOT-NEXT: mov d1, v1.d[1] -; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h -; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-DOT-NEXT: movi v3.8b, #128 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v2.8b, v2.8b, v3.8b +; CHECK-DOT-NEXT: udot v4.2s, v3.8b, v1.8b +; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b +; CHECK-DOT-NEXT: sub v0.2s, v0.2s, v4.2s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: usdot_narrow: @@ -377,14 +453,12 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ ; ; CHECK-DOT-LABEL: sudot: ; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-DOT-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h -; CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h -; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-DOT-NEXT: movi v3.16b, #128 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v1.16b, v1.16b, v3.16b +; CHECK-DOT-NEXT: udot v4.4s, v3.16b, v2.16b +; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b +; CHECK-DOT-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: sudot: @@ -403,7 +477,7 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NODOT-NEXT: mov x8, xzr -; CHECK-NODOT-NEXT: .LBB9_1: // %vector.body +; CHECK-NODOT-NEXT: .LBB10_1: // %vector.body ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] ; CHECK-NODOT-NEXT: ldr q3, [x1, x8] @@ -418,30 +492,29 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ ; CHECK-NODOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h ; CHECK-NODOT-NEXT: smlal v1.4s, v2.4h, v3.4h ; CHECK-NODOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h -; CHECK-NODOT-NEXT: b.ne .LBB9_1 +; CHECK-NODOT-NEXT: b.ne .LBB10_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: ret ; ; CHECK-DOT-LABEL: sudot_in_loop: ; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v1.16b, #128 ; CHECK-DOT-NEXT: mov x8, xzr -; CHECK-DOT-NEXT: .LBB9_1: // %vector.body +; CHECK-DOT-NEXT: .LBB10_1: // %vector.body ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-NEXT: ldr q0, [x1, x8] +; CHECK-DOT-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v4.16b, v0.16b, v1.16b +; CHECK-DOT-NEXT: mov v0.16b, v2.16b ; CHECK-DOT-NEXT: ldr q2, [x0, x8] -; CHECK-DOT-NEXT: ldr q3, [x1, x8] -; CHECK-DOT-NEXT: mov v0.16b, v1.16b ; CHECK-DOT-NEXT: add x8, x8, #16 -; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-DOT-NEXT: sshll v5.8h, v3.8b, #0 -; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: udot v3.4s, v1.16b, v2.16b ; CHECK-DOT-NEXT: cmp x8, #16 -; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h -; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h -; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h -; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h -; CHECK-DOT-NEXT: b.ne .LBB9_1 +; CHECK-DOT-NEXT: mov v5.16b, v0.16b +; CHECK-DOT-NEXT: udot v5.4s, v4.16b, v2.16b +; CHECK-DOT-NEXT: sub v2.4s, v5.4s, v3.4s +; CHECK-DOT-NEXT: b.ne .LBB10_1 ; CHECK-DOT-NEXT: // %bb.2: // %end ; CHECK-DOT-NEXT: ret ; @@ -449,7 +522,7 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ ; CHECK-DOT-I8MM: // %bb.0: // %entry ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 ; CHECK-DOT-I8MM-NEXT: mov x8, xzr -; CHECK-DOT-I8MM-NEXT: .LBB9_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: .LBB10_1: // %vector.body ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] ; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] @@ -457,7 +530,7 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 ; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b ; CHECK-DOT-I8MM-NEXT: cmp x8, #16 -; CHECK-DOT-I8MM-NEXT: b.ne .LBB9_1 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB10_1 ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end ; CHECK-DOT-I8MM-NEXT: ret entry: @@ -502,19 +575,12 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; ; CHECK-DOT-LABEL: sudot_narrow: ; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-DOT-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-DOT-NEXT: mov d4, v1.d[1] -; CHECK-DOT-NEXT: mov d5, v2.d[1] -; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-DOT-NEXT: mov d3, v3.d[1] -; CHECK-DOT-NEXT: mov d1, v1.d[1] -; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h -; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-DOT-NEXT: movi v3.8b, #128 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v1.8b, v1.8b, v3.8b +; CHECK-DOT-NEXT: udot v4.2s, v3.8b, v2.8b +; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b +; CHECK-DOT-NEXT: sub v0.2s, v0.2s, v4.2s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: sudot_narrow: @@ -641,26 +707,15 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; ; CHECK-DOT-LABEL: usdot_8to64: ; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: sshll v5.8h, v3.8b, #0 -; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-DOT-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-DOT-NEXT: ushll v7.4s, v2.4h, #0 -; CHECK-DOT-NEXT: sshll v16.4s, v5.4h, #0 -; CHECK-DOT-NEXT: sshll v17.4s, v3.4h, #0 -; CHECK-DOT-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-DOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-DOT-NEXT: sshll2 v5.4s, v5.8h, #0 -; CHECK-DOT-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-DOT-NEXT: smlal v0.2d, v6.2s, v16.2s -; CHECK-DOT-NEXT: smlal v1.2d, v7.2s, v17.2s -; CHECK-DOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s -; CHECK-DOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s -; CHECK-DOT-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-DOT-NEXT: smlal v1.2d, v2.2s, v3.2s -; CHECK-DOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-DOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-DOT-NEXT: movi v4.16b, #128 +; CHECK-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v6.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v3.16b, v3.16b, v4.16b +; CHECK-DOT-NEXT: udot v6.4s, v4.16b, v2.16b +; CHECK-DOT-NEXT: udot v5.4s, v3.16b, v2.16b +; CHECK-DOT-NEXT: sub v2.4s, v5.4s, v6.4s +; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v2.2s +; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: usdot_8to64: @@ -706,26 +761,15 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; ; CHECK-DOT-LABEL: sudot_8to64: ; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0 -; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-DOT-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-DOT-NEXT: sshll v7.4s, v2.4h, #0 -; CHECK-DOT-NEXT: ushll v16.4s, v5.4h, #0 -; CHECK-DOT-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-DOT-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-DOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-DOT-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-DOT-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-DOT-NEXT: smlal v0.2d, v6.2s, v16.2s -; CHECK-DOT-NEXT: smlal v1.2d, v7.2s, v17.2s -; CHECK-DOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s -; CHECK-DOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s -; CHECK-DOT-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-DOT-NEXT: smlal v1.2d, v2.2s, v3.2s -; CHECK-DOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-DOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-DOT-NEXT: movi v4.16b, #128 +; CHECK-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v6.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v2.16b, v2.16b, v4.16b +; CHECK-DOT-NEXT: udot v6.4s, v4.16b, v3.16b +; CHECK-DOT-NEXT: udot v5.4s, v2.16b, v3.16b +; CHECK-DOT-NEXT: sub v2.4s, v5.4s, v6.4s +; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v2.2s +; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: sudot_8to64: @@ -776,7 +820,7 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NODOT-NEXT: mov x8, xzr -; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body +; CHECK-NODOT-NEXT: .LBB17_1: // %vector.body ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] ; CHECK-NODOT-NEXT: mov v0.16b, v1.16b @@ -788,7 +832,7 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h ; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NODOT-NEXT: b.ne .LBB16_1 +; CHECK-NODOT-NEXT: b.ne .LBB17_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: ret ; @@ -797,14 +841,14 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-DOT-NEXT: movi v2.16b, #1 ; CHECK-DOT-NEXT: mov x8, xzr -; CHECK-DOT-NEXT: .LBB16_1: // %vector.body +; CHECK-DOT-NEXT: .LBB17_1: // %vector.body ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-DOT-NEXT: ldr q3, [x0, x8] ; CHECK-DOT-NEXT: mov v0.16b, v1.16b ; CHECK-DOT-NEXT: add x8, x8, #16 ; CHECK-DOT-NEXT: cmp x8, #16 ; CHECK-DOT-NEXT: udot v1.4s, v3.16b, v2.16b -; CHECK-DOT-NEXT: b.ne .LBB16_1 +; CHECK-DOT-NEXT: b.ne .LBB17_1 ; CHECK-DOT-NEXT: // %bb.2: // %end ; CHECK-DOT-NEXT: ret ; @@ -813,14 +857,14 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 ; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1 ; CHECK-DOT-I8MM-NEXT: mov x8, xzr -; CHECK-DOT-I8MM-NEXT: .LBB16_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: .LBB17_1: // %vector.body ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-DOT-I8MM-NEXT: ldr q3, [x0, x8] ; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 ; CHECK-DOT-I8MM-NEXT: cmp x8, #16 ; CHECK-DOT-I8MM-NEXT: udot v1.4s, v3.16b, v2.16b -; CHECK-DOT-I8MM-NEXT: b.ne .LBB16_1 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB17_1 ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end ; CHECK-DOT-I8MM-NEXT: ret entry: @@ -1136,7 +1180,7 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; CHECK-NODOT-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NODOT-NEXT: mov x8, xzr -; CHECK-NODOT-NEXT: .LBB28_1: // %vector.body +; CHECK-NODOT-NEXT: .LBB29_1: // %vector.body ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] ; CHECK-NODOT-NEXT: ldr q3, [x1, x8] @@ -1157,7 +1201,7 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; CHECK-NODOT-NEXT: smlal v1.4s, v3.4h, v4.4h ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v4.8h ; CHECK-NODOT-NEXT: smlal2 v1.4s, v3.8h, v4.8h -; CHECK-NODOT-NEXT: b.ne .LBB28_1 +; CHECK-NODOT-NEXT: b.ne .LBB29_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NODOT-NEXT: ret @@ -1165,32 +1209,27 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; CHECK-DOT-LABEL: usdot_multiple_zext_users: ; CHECK-DOT: // %bb.0: // %entry ; CHECK-DOT-NEXT: movi v0.2d, #0000000000000000 -; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v1.16b, #128 ; CHECK-DOT-NEXT: mov x8, xzr -; CHECK-DOT-NEXT: .LBB28_1: // %vector.body +; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-DOT-NEXT: .LBB29_1: // %vector.body ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-DOT-NEXT: ldr q2, [x0, x8] -; CHECK-DOT-NEXT: ldr q3, [x1, x8] -; CHECK-DOT-NEXT: ldr q4, [x2, x8] +; CHECK-DOT-NEXT: ldr q3, [x0, x8] +; CHECK-DOT-NEXT: ldr q4, [x1, x8] +; CHECK-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-DOT-NEXT: ldr q6, [x2, x8] ; CHECK-DOT-NEXT: add x8, x8, #16 -; CHECK-DOT-NEXT: sshll v5.8h, v2.8b, #0 -; CHECK-DOT-NEXT: ushll v6.8h, v4.8b, #0 -; CHECK-DOT-NEXT: sshll v7.8h, v3.8b, #0 -; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-DOT-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: eor v3.16b, v3.16b, v1.16b +; CHECK-DOT-NEXT: eor v4.16b, v4.16b, v1.16b ; CHECK-DOT-NEXT: cmp x8, #1024 -; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v6.4h -; CHECK-DOT-NEXT: smlal v1.4s, v7.4h, v6.4h -; CHECK-DOT-NEXT: smlal2 v0.4s, v5.8h, v6.8h -; CHECK-DOT-NEXT: smlal2 v1.4s, v7.8h, v6.8h -; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v4.4h -; CHECK-DOT-NEXT: smlal v1.4s, v3.4h, v4.4h -; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v4.8h -; CHECK-DOT-NEXT: smlal2 v1.4s, v3.8h, v4.8h -; CHECK-DOT-NEXT: b.ne .LBB28_1 +; CHECK-DOT-NEXT: udot v5.4s, v1.16b, v6.16b +; CHECK-DOT-NEXT: udot v0.4s, v3.16b, v6.16b +; CHECK-DOT-NEXT: udot v2.4s, v4.16b, v6.16b +; CHECK-DOT-NEXT: sub v0.4s, v0.4s, v5.4s +; CHECK-DOT-NEXT: sub v2.4s, v2.4s, v5.4s +; CHECK-DOT-NEXT: b.ne .LBB29_1 ; CHECK-DOT-NEXT: // %bb.2: // %end -; CHECK-DOT-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-DOT-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-DOT-NEXT: ret ; ; CHECK-DOT-I8MM-LABEL: usdot_multiple_zext_users: @@ -1198,7 +1237,7 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; CHECK-DOT-I8MM-NEXT: movi v0.2d, #0000000000000000 ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 ; CHECK-DOT-I8MM-NEXT: mov x8, xzr -; CHECK-DOT-I8MM-NEXT: .LBB28_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: .LBB29_1: // %vector.body ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] ; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] @@ -1207,7 +1246,7 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v4.16b, v2.16b ; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v4.16b, v3.16b ; CHECK-DOT-I8MM-NEXT: cmp x8, #1024 -; CHECK-DOT-I8MM-NEXT: b.ne .LBB28_1 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB29_1 ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end ; CHECK-DOT-I8MM-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-DOT-I8MM-NEXT: ret @@ -1500,3 +1539,114 @@ entry: %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add(<2 x i32> %acc, <16 x i32> %input.wide) ret <2 x i32> %partial.reduce } + +define <2 x i64> @usdot_v16i8tov2i64(<2 x i64> %acc, <16 x i8> %u, <16 x i8> %s) { +; CHECK-NODOT-LABEL: usdot_v16i8tov2i64: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: ushll v5.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-NODOT-NEXT: smlal v0.2d, v6.2s, v5.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v6.4s, v5.4s +; CHECK-NODOT-NEXT: smlal v0.2d, v4.2s, v3.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v4.4s, v3.4s +; CHECK-NODOT-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-NODOT-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: smlal v0.2d, v4.2s, v3.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v4.4s, v3.4s +; CHECK-NODOT-NEXT: smlal v0.2d, v2.2s, v1.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v2.4s, v1.4s +; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: usdot_v16i8tov2i64: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v3.16b, #128 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v5.2d, #0000000000000000 +; CHECK-DOT-NEXT: eor v2.16b, v2.16b, v3.16b +; CHECK-DOT-NEXT: udot v5.4s, v3.16b, v1.16b +; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v1.16b +; CHECK-DOT-NEXT: sub v1.4s, v4.4s, v5.4s +; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v1.2s +; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v1.4s +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: usdot_v16i8tov2i64: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v3.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: usdot v3.4s, v1.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v3.2s +; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v3.4s +; CHECK-DOT-I8MM-NEXT: ret +entry: + %u.wide = zext <16 x i8> %u to <16 x i64> + %s.wide = sext <16 x i8> %s to <16 x i64> + %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide + %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> %acc, <16 x i64> %mult) + ret <2 x i64> %partial.reduce +} + +define <2 x i32> @usdot_v16i8tov2i32(<2 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { +; CHECK-NODOT-LABEL: usdot_v16i8tov2i32: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: smull v5.4s, v4.4h, v3.4h +; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-NODOT-NEXT: mov d6, v3.d[1] +; CHECK-NODOT-NEXT: mov d7, v4.d[1] +; CHECK-NODOT-NEXT: smull2 v3.4s, v4.8h, v3.8h +; CHECK-NODOT-NEXT: smull v4.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: mov d5, v5.d[1] +; CHECK-NODOT-NEXT: mov d3, v3.d[1] +; CHECK-NODOT-NEXT: add v0.2s, v5.2s, v0.2s +; CHECK-NODOT-NEXT: mov d5, v2.d[1] +; CHECK-NODOT-NEXT: smlal v0.4s, v7.4h, v6.4h +; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NODOT-NEXT: mov d3, v4.d[1] +; CHECK-NODOT-NEXT: mov d4, v1.d[1] +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NODOT-NEXT: mov d1, v1.d[1] +; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: usdot_v16i8tov2i32: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v3.16b, #128 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: fmov d0, d0 +; CHECK-DOT-NEXT: eor v2.16b, v2.16b, v3.16b +; CHECK-DOT-NEXT: udot v4.4s, v3.16b, v1.16b +; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b +; CHECK-DOT-NEXT: sub v0.4s, v0.4s, v4.4s +; CHECK-DOT-NEXT: addp v0.4s, v0.4s, v0.4s +; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: usdot_v16i8tov2i32: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: fmov d0, d0 +; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: addp v0.4s, v0.4s, v0.4s +; CHECK-DOT-I8MM-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-DOT-I8MM-NEXT: ret +entry: + %u.wide = zext <16 x i8> %u to <16 x i32> + %s.wide = sext <16 x i8> %s to <16 x i32> + %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide + %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v16i32(<2 x i32> %acc, <16 x i32> %mult) + ret <2 x i32> %partial.reduce +}