-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[AArch64][GlobalISel] Added usdot intrinsic support #162615
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Joshua Rodriguez (JoshdRod) Changes: GlobalISel now selects the usdot intrinsic without falling back to SDAG. Full diff: https://github.com/llvm/llvm-project/pull/162615.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 7322212c5bb24..fe8419301b306 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -233,6 +233,12 @@ def G_SDOT : AArch64GenericInstruction {
let hasSideEffects = 0;
}
+def G_USDOT : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src1, type0:$src2, type0:$src3);
+ let hasSideEffects = 0;
+}
+
// Generic instruction for the BSP pseudo. It is expanded into BSP, which
// expands into BSL/BIT/BIF after register allocation.
def G_BSP : AArch64GenericInstruction {
@@ -278,6 +284,7 @@ def : GINodeEquiv<G_UADDLV, AArch64uaddlv>;
def : GINodeEquiv<G_UDOT, AArch64udot>;
def : GINodeEquiv<G_SDOT, AArch64sdot>;
+def : GINodeEquiv<G_USDOT, AArch64usdot>;
def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index ea2196a584127..acf80e1f582b6 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1855,6 +1855,8 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return LowerTriOp(AArch64::G_UDOT);
case Intrinsic::aarch64_neon_sdot:
return LowerTriOp(AArch64::G_SDOT);
+ case Intrinsic::aarch64_neon_usdot:
+ return LowerTriOp(AArch64::G_USDOT);
case Intrinsic::aarch64_neon_sqxtn:
return LowerUnaryOp(TargetOpcode::G_TRUNC_SSAT_S);
case Intrinsic::aarch64_neon_sqxtun:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index f90bcc7a77cdf..830a35bbeb494 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -590,6 +590,8 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
unsigned Depth) const {
switch (MI.getOpcode()) {
case AArch64::G_DUP:
+ case AArch64::G_SADDLP:
+ case AArch64::G_UADDLP:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -798,6 +800,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (Ty.isVector())
OpRegBankIdx[Idx] = PMI_FirstFPR;
else if (isPreISelGenericFloatingPointOpcode(Opc) ||
+ (MO.isDef() && onlyDefinesFP(MI, MRI, TRI)) ||
+ (MO.isUse() && onlyUsesFP(MI, MRI, TRI)) ||
Ty.getSizeInBits() > 64)
OpRegBankIdx[Idx] = PMI_FirstFPR;
else
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index 649d0a9bfcab4..5329b2d1f2e3c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -1,41 +1,54 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: smmla.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: smmla.v4i32.v16i8
-; CHECK: smmla v0.4s, v1.16b, v2.16b
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
ret <4 x i32> %vmmla1.i
}
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: ummla.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ummla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: ummla.v4i32.v16i8
-; CHECK: ummla v0.4s, v1.16b, v2.16b
%vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
ret <4 x i32> %vmmla1.i
}
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usmmla.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usmmla.v4i32.v16i8
-; CHECK: usmmla v0.4s, v1.16b, v2.16b
%vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x i32> %vusmmla1.i
}
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot.v2i32.v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.2s, v1.8b, v2.8b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot.v2i32.v8i8
-; CHECK: usdot v0.2s, v1.8b, v2.8b
%vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
ret <2 x i32> %vusdot1.i
}
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v2i32.v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_lane.v2i32.v8i8
-; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -44,9 +57,12 @@ entry:
}
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v2i32.v8i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_lane.v2i32.v8i8
-; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -55,9 +71,11 @@ entry:
}
define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v2i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_lane.v2i32.v16i8
-; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -66,9 +84,11 @@ entry:
}
define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v2i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sudot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_lane.v2i32.v16i8
-; CHECK: sudot v0.2s, v1.8b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer
%1 = bitcast <2 x i32> %shuffle to <8 x i8>
@@ -77,17 +97,22 @@ entry:
}
define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot.v4i32.v16i8
-; CHECK: usdot v0.4s, v1.16b, v2.16b
%vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
ret <4 x i32> %vusdot1.i
}
define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_lane.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_lane.v4i32.v16i8
-; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -96,9 +121,12 @@ entry:
}
define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: sudot_lane.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_lane.v4i32.v16i8
-; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <8 x i8> %b to <2 x i32>
%shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -107,9 +135,11 @@ entry:
}
define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_laneq.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: usdot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: usdot_laneq.v4i32.v16i8
-; CHECK: usdot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -118,9 +148,11 @@ entry:
}
define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: sudot_laneq.v4i32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sudot v0.4s, v1.16b, v2.4b[0]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: sudot_laneq.v4i32.v16i8
-; CHECK: sudot v0.4s, v1.16b, v2.4b[0]
%0 = bitcast <16 x i8> %b to <4 x i32>
%shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
%1 = bitcast <4 x i32> %shuffle to <16 x i8>
@@ -134,3 +166,6 @@ declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <1
declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index 11fb73237da07..e3c80256feea0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1,9 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI: warning: Instruction selection used fallback path for saddlp1d
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uaddlp1d
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addhn8b:
|
eacff9e
to
1ba9bc6
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me - thanks for the fix :)
@@ -1,41 +1,54 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need for these `--check-prefixes` parameters if the tests no longer diverge.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Resolved, let me know if there are any issues 👍
GlobalISel now selects usdot intrinsic, without falling back to SDAG.
1ba9bc6
to
7f07643
Compare
GlobalISel now selects usdot intrinsic, without falling back to SDAG.