Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21813,7 +21813,8 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
// Handle commutativity
auto isZeroDot = [](SDValue Dot) {
return (Dot.getOpcode() == AArch64ISD::UDOT ||
Dot.getOpcode() == AArch64ISD::SDOT) &&
Dot.getOpcode() == AArch64ISD::SDOT ||
Dot.getOpcode() == AArch64ISD::USDOT) &&
isZerosVector(Dot.getOperand(0).getNode());
};
if (!isZeroDot(Dot))
Expand Down
40 changes: 38 additions & 2 deletions llvm/test/CodeGen/AArch64/aarch64-matmul.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: smmla.v4i32.v16i8:
Expand Down Expand Up @@ -160,6 +160,42 @@ entry:
ret <4 x i32> %vusdot1.i
}

; 64-bit case: %r is added to the result of a usdot intrinsic call whose
; accumulator operand is zeroinitializer. The run without -global-isel expects
; the add to disappear, leaving one usdot that accumulates directly into v0;
; the -global-isel run still expects a zeroing movi, the usdot and a separate add.
define <2 x i32> @usdot_add_zero.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-LABEL: usdot_add_zero.v2i32.v8i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: usdot v0.2s, v1.8b, v2.8b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: usdot_add_zero.v2i32.v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-NEXT: usdot v3.2s, v1.8b, v2.8b
; CHECK-GI-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-GI-NEXT: ret
entry:
%x = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %a, <8 x i8> %b)
%y = add <2 x i32> %x, %r
ret <2 x i32> %y
}

; 128-bit case: %r is added to the result of a usdot intrinsic call whose
; accumulator operand is zeroinitializer. The run without -global-isel expects
; the add to disappear, leaving one usdot that accumulates directly into v0;
; the -global-isel run still expects a zeroing movi, the usdot and a separate add.
define <4 x i32> @usdot_add_zero.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: usdot_add_zero.v4i32.v16i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: usdot_add_zero.v4i32.v16i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-NEXT: usdot v3.4s, v1.16b, v2.16b
; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-GI-NEXT: ret
entry:
%x = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %a, <16 x i8> %b)
%y = add <4 x i32> %x, %r
ret <4 x i32> %y
}

declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
Expand Down
114 changes: 44 additions & 70 deletions llvm/test/CodeGen/AArch64/neon-dotreduce.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1375,11 +1375,9 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i
; CHECK-SD-LABEL: test_usdot_v8i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b
; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b
; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-SD-NEXT: usdot v4.2s, v0.8b, v1.8b
; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
Expand Down Expand Up @@ -1416,11 +1414,9 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8
; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
; CHECK-SD-NEXT: usdot v5.2s, v1.8b, v0.8b
; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b
; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-SD-NEXT: usdot v4.2s, v1.8b, v0.8b
; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
Expand Down Expand Up @@ -1457,11 +1453,9 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1
; CHECK-SD-LABEL: test_usdot_v16i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b
; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b
; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: usdot v4.4s, v0.16b, v1.16b
; CHECK-SD-NEXT: addv s0, v4.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
Expand Down Expand Up @@ -1509,11 +1503,9 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b,
; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b
; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b
; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: usdot v4.4s, v1.16b, v0.16b
; CHECK-SD-NEXT: addv s0, v4.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
Expand Down Expand Up @@ -4384,12 +4376,10 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
; CHECK-SD-LABEL: test_usdot_v32i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q2, q3, [x0]
; CHECK-SD-NEXT: ldp q4, q5, [x1]
; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ldp q1, q3, [x0]
; CHECK-SD-NEXT: ldp q2, q4, [x1]
; CHECK-SD-NEXT: usdot v0.4s, v3.16b, v4.16b
; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w2
Expand Down Expand Up @@ -4438,15 +4428,11 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: usdot v17.4s, v1.16b, v3.16b
; CHECK-SD-NEXT: usdot v16.4s, v5.16b, v7.16b
; CHECK-SD-NEXT: usdot v17.4s, v0.16b, v2.16b
; CHECK-SD-NEXT: usdot v16.4s, v4.16b, v6.16b
; CHECK-SD-NEXT: add v0.4s, v17.4s, v16.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
Expand Down Expand Up @@ -8781,20 +8767,16 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %
; CHECK-SD-LABEL: test_usdot_v64i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q1, q2, [x0, #32]
; CHECK-SD-NEXT: ldp q6, q7, [x1, #32]
; CHECK-SD-NEXT: ldp q16, q17, [x0]
; CHECK-SD-NEXT: ldp q18, q19, [x1]
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b
; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b
; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b
; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b
; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s
; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q2, q3, [x0, #32]
; CHECK-SD-NEXT: ldp q4, q5, [x1, #32]
; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
; CHECK-SD-NEXT: ldp q2, q3, [x0]
; CHECK-SD-NEXT: ldp q4, q5, [x1]
; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w2
Expand Down Expand Up @@ -8863,32 +8845,24 @@ entry:
define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; CHECK-SD-LABEL: test_usdot_v64i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
; CHECK-SD-NEXT: movi v22.2d, #0000000000000000
; CHECK-SD-NEXT: movi v23.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q16, q17, [sp, #64]
; CHECK-SD-NEXT: movi v24.2d, #0000000000000000
; CHECK-SD-NEXT: movi v25.2d, #0000000000000000
; CHECK-SD-NEXT: movi v26.2d, #0000000000000000
; CHECK-SD-NEXT: movi v27.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q19, q20, [sp, #96]
; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b
; CHECK-SD-NEXT: ldp q3, q7, [sp, #32]
; CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b
; CHECK-SD-NEXT: ldp q1, q5, [sp]
; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b
; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b
; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b
; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b
; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b
; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b
; CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s
; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s
; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s
; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s
; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q20, q21, [sp, #96]
; CHECK-SD-NEXT: ldp q22, q23, [sp, #32]
; CHECK-SD-NEXT: usdot v16.4s, v3.16b, v7.16b
; CHECK-SD-NEXT: usdot v18.4s, v2.16b, v6.16b
; CHECK-SD-NEXT: usdot v19.4s, v23.16b, v21.16b
; CHECK-SD-NEXT: usdot v17.4s, v22.16b, v20.16b
; CHECK-SD-NEXT: ldp q2, q3, [sp, #64]
; CHECK-SD-NEXT: ldp q6, q7, [sp]
; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v5.16b
; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v4.16b
; CHECK-SD-NEXT: usdot v19.4s, v7.16b, v3.16b
; CHECK-SD-NEXT: usdot v17.4s, v6.16b, v2.16b
; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
Expand Down