
[AArch64] Remove copy instruction between uaddlv with v8i16 and dup #66068

Closed

Conversation

jaykang10 (Contributor):

If there are copy instructions between a uaddlv with v8i16 and a dup, used to transfer the value from GPR to FPR, try to remove them by using duplane instead.
It is a follow-up patch to https://reviews.llvm.org/D159267.

llvmbot (Collaborator) commented Sep 12, 2023

@llvm/pr-subscribers-backend-aarch64

Changes

If there are copy instructions between a uaddlv with v8i16 and a dup, used to transfer the value from GPR to FPR, try to remove them by using duplane instead.
It is a follow-up patch to https://reviews.llvm.org/D159267.

Full diff: https://github.com/llvm/llvm-project/pull/66068.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+2-1)
  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+8)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll (+6-6)
  • (modified) llvm/test/CodeGen/AArch64/neon-addlv.ll (+18-1)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fb25dd1d77553fb..510cf33e7007029 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5329,7 +5329,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+    if (ResVT == MVT::i32 &&
+        (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || OpVT == MVT::v8i16)) {
       // In order to avoid insert_subvector, used v4i32 than v2i32.
       SDValue UADDLV =
           DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 82b79cd7232cc90..173b6dd67d20c88 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6077,6 +6077,8 @@ defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
 defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
 defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
+defm : DUPWithTruncPats<v4i32, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
 multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
                                SDNodeXForm IdxXFORM> {
   def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
@@ -6472,12 +6474,18 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
             (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
             ssub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v8i8 V64:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i8v V64:$Rn), hsub))>;
 
 def : Pat<(v4i32 (AArch64uaddlv (v16i8 V128:$Rn))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$Rn), hsub))>;
 
+def : Pat<(v4i32 (AArch64uaddlv (v8i16 V128:$Rn))),
+          (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$Rn), ssub))>;
+
 // Patterns for across-vector intrinsics, that have a node equivalent, that
 // returns a vector (with only the low lane defined) instead of a scalar.
 // In effect, opNode is the same as (scalar_to_vector (IntNode)).
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index bf420700eb575fb..ccc7a2d9ebac4c2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -14,8 +14,8 @@ define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.s v1[0], v0[0]
-; CHECK-NEXT:    ucvtf.2s v1, v1
-; CHECK-NEXT:    str d1, [x0]
+; CHECK-NEXT:    ucvtf.2s v0, v1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -52,8 +52,8 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -76,8 +76,8 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
 ; CHECK-NEXT:    str d0, [x0, #80]
 ; CHECK-NEXT:    mov.s v2[0], v1[0]
-; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    str q2, [x0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll
index 1b037c13aa4b546..7cc1d2e6647ab96 100644
--- a/llvm/test/CodeGen/AArch64/neon-addlv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll
@@ -194,8 +194,23 @@ entry:
   ret <8 x i8> %vrshrn_n2
 }
 
-declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+define dso_local <8 x i16> @uaddlv_dup_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: uaddlv_dup_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uaddlv s0, v0.8h
+; CHECK-NEXT:    dup v1.8h, v0.h[0]
+; CHECK-NEXT:    rshrn v0.4h, v1.4s, #3
+; CHECK-NEXT:    rshrn2 v0.8h, v1.4s, #3
+; CHECK-NEXT:    ret
+entry:
+  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+  %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
+  %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
+  %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)
+  ret <8 x i16> %vrshrn_n2
+}
 
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
 declare i64 @llvm.aarch64.neon.urshl.i64(i64, i64)
 
 define <8 x i8> @uaddlv_v8i8_urshr(<8 x i8> %a) {
@@ -215,3 +230,5 @@ entry:
   %vecinit7.i = shufflevector <8 x i8> %vecinit.i, <8 x i8> poison, <8 x i32> zeroinitializer
   ret <8 x i8> %vecinit7.i
 }
+declare <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32>, i32)
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)

Collaborator (review comment on llvm/lib/Target/AArch64/AArch64ISelLowering.cpp):

@@ -5329,7 +5329,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_uaddlv: {
     EVT OpVT = Op.getOperand(1).getValueType();
     EVT ResVT = Op.getValueType();
-    if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8)) {
+    if (ResVT == MVT::i32 &&
+        (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || OpVT == MVT::v8i16)) {

Should this add v4i16 too?

Collaborator (review comment on llvm/lib/Target/AArch64/AArch64InstrInfo.td):

@@ -6077,6 +6077,8 @@ defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
 defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
 defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
 
+defm : DUPWithTruncPats<v4i32, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

There isn't really a trunc going on here, if I'm understanding what is going on. Can we add a DAG combine for

  t38: i32 = extract_vector_elt t36, Constant:i64<0>
t35: v4i32 = AArch64ISD::DUP t38

We should be able to turn that into an AArch64ISD::DUPLANE, and I believe it should be generally useful to do so.
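Such a combine might look roughly like the sketch below, written in the style of the existing helpers in AArch64ISelLowering.cpp. This is an illustration only: the helper name tryCombineDUPOfExtract, the exact legality checks, and the wiring into performDAGCombine for AArch64ISD::DUP nodes are assumptions, not the actual patch.

// Sketch only: fold (AArch64ISD::DUP (extract_vector_elt V, C)) into a
// DUPLANE node so the extracted value never round-trips through a GPR.
static SDValue tryCombineDUPOfExtract(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::DUP && "expected a DUP node");

  SDValue Extract = N->getOperand(0);
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  // DUPLANE needs a compile-time-constant lane index.
  auto *Idx = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
  if (!Idx)
    return SDValue();

  SDValue Vec = Extract.getOperand(0);
  EVT VecVT = Vec.getValueType();
  EVT ResVT = N->getValueType(0);

  // Keep the sketch simple: only 128-bit sources whose element type
  // already matches the result's element type (the v4i32 case above).
  if (!VecVT.is128BitVector() ||
      VecVT.getVectorElementType() != ResVT.getVectorElementType())
    return SDValue();

  unsigned Opc;
  switch (ResVT.getScalarSizeInBits()) {
  case 8:  Opc = AArch64ISD::DUPLANE8;  break;
  case 16: Opc = AArch64ISD::DUPLANE16; break;
  case 32: Opc = AArch64ISD::DUPLANE32; break;
  case 64: Opc = AArch64ISD::DUPLANE64; break;
  default: return SDValue();
  }

  SDLoc DL(N);
  return DAG.getNode(Opc, DL, ResVT, Vec,
                     DAG.getConstant(Idx->getZExtValue(), DL, MVT::i64));
}

On the DAG above, this would rewrite the v4i32 AArch64ISD::DUP of the extracted i32 into an AArch64ISD::DUPLANE32 of lane 0 of t36, which can select to a single dup-by-element instruction with no GPR copy.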

Collaborator (review comment on llvm/test/CodeGen/AArch64/neon-addlv.ll):

; CHECK-LABEL: uaddlv_dup_v8i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv s0, v0.8h
; CHECK-NEXT:    dup v1.8h, v0.h[0]

I think the .8h might be incorrect here?

Collaborator (review comment on llvm/test/CodeGen/AArch64/neon-addlv.ll):

  %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
  %vecinit.i = insertelement <8 x i32> undef, i32 %vaddlv.i, i64 0
  %vecinit7.i = shufflevector <8 x i32> %vecinit.i, <8 x i32> poison, <8 x i32> zeroinitializer
  %vrshrn_n2 = tail call <8 x i16> @llvm.aarch64.neon.rshrn.v8i16(<8 x i32> %vecinit7.i, i32 3)

This isn't a valid neon intrinsic - they need legal vector sizes for the inputs and outputs. I think it works in this case because it gets expanded to shifts and whatnot. Is there another instruction that could be used in its place for the test? Maybe just a simple shift?

jaykang10 (Contributor, Author):

@davemgreen Sorry, it looks like I made a mistake; I did not get a notification for this pull request.
Let me update the code according to your comments.
