diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index 7322212c5bb24..fe8419301b306 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_USDOT : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3); + let hasSideEffects = 0; +} + // Generic instruction for the BSP pseudo. It is expanded into BSP, which // expands into BSL/BIT/BIF after register allocation. def G_BSP : AArch64GenericInstruction { @@ -278,6 +284,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv<G_USDOT, AArch64usdot>; def : GINodeEquiv; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 9e2d698e04ae7..05a431312472e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return LowerTriOp(AArch64::G_UDOT); case Intrinsic::aarch64_neon_sdot: return LowerTriOp(AArch64::G_SDOT); + case Intrinsic::aarch64_neon_usdot: + return LowerTriOp(AArch64::G_USDOT); case Intrinsic::aarch64_neon_sqxtn: return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S); case Intrinsic::aarch64_neon_sqxtun: diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll index 649d0a9bfcab4..99260f0625098 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll @@ -1,41 +1,53 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm 
-global-isel < %s | FileCheck %s define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: smmla.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: smmla.v4i32.v16i8 -; CHECK: smmla v0.4s, v1.16b, v2.16b %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) ret <4 x i32> %vmmla1.i } define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: ummla.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: ummla.v4i32.v16i8 -; CHECK: ummla v0.4s, v1.16b, v2.16b %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) ret <4 x i32> %vmmla1.i } define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usmmla.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usmmla.v4i32.v16i8 -; CHECK: usmmla v0.4s, v1.16b, v2.16b %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 ret <4 x i32> %vusmmla1.i } define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: usdot.v2i32.v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot.v2i32.v8i8 -; CHECK: usdot v0.2s, v1.8b, v2.8b %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) ret <2 x i32> %vusdot1.i } define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: usdot_lane.v2i32.v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: 
-; CHECK-LABEL: usdot_lane.v2i32.v8i8 -; CHECK: usdot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -44,9 +56,12 @@ entry: } define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: sudot_lane.v2i32.v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_lane.v2i32.v8i8 -; CHECK: sudot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -55,9 +70,11 @@ entry: } define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usdot_lane.v2i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_lane.v2i32.v16i8 -; CHECK: usdot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -66,9 +83,11 @@ entry: } define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: sudot_lane.v2i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_lane.v2i32.v16i8 -; CHECK: sudot v0.2s, v1.8b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer %1 = bitcast <2 x i32> %shuffle to <8 x i8> @@ -77,17 +96,22 @@ entry: } define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usdot.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b +; 
CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot.v4i32.v16i8 -; CHECK: usdot v0.4s, v1.16b, v2.16b %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 ret <4 x i32> %vusdot1.i } define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: usdot_lane.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_lane.v4i32.v16i8 -; CHECK: usdot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -96,9 +120,12 @@ entry: } define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: sudot_lane.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_lane.v4i32.v16i8 -; CHECK: sudot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <8 x i8> %b to <2 x i32> %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -107,9 +134,11 @@ entry: } define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: usdot_laneq.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: usdot_laneq.v4i32.v16i8 -; CHECK: usdot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -118,9 +147,11 @@ entry: } define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: sudot_laneq.v4i32.v16i8: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0] +; CHECK-NEXT: ret entry: -; CHECK-LABEL: sudot_laneq.v4i32.v16i8 -; CHECK: sudot v0.4s, v1.16b, v2.4b[0] %0 = bitcast <16 x i8> %b to <4 x i32> %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer %1 = bitcast <4 x i32> %shuffle to <16 x i8> @@ -134,3 +165,6 @@ declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <1 declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}}