[AArch64] Combine concat(binop, binop) into binop(concat, concat) #89911

Merged: 1 commit into llvm:main on Apr 25, 2024

Conversation

davemgreen (Collaborator)

This generalizes the existing combine for concat(radd, radd) to any binop. For much the same reason as the existing code, pushing the concat up through the tree is hopefully quicker than (or the same as) the existing two half-vector operations, and it can help combine away the concat.
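As a concrete illustration, here is a minimal IR sketch modelled on the concat_add test in llvm/test/CodeGen/AArch64/concatbinop.ll (the shufflevector-based concat is assumed, since the full test body is not visible in the hunk below). With the combine, concat(add(a,b), add(c,d)) is rewritten to add(concat(a,c), concat(b,d)), so the CHECK lines change from two 4h adds plus a lane move to two lane moves plus a single 8h add.

define <8 x i16> @concat_add(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
  ; add the two low halves and the two high halves separately...
  %x = add <4 x i16> %a, %b
  %y = add <4 x i16> %c, %d
  ; ...then concatenate the results (assumed shufflevector form of the concat)
  %r = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %r
}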

@llvmbot (Collaborator) commented Apr 24, 2024

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This generalizes the existing combine for concat(radd, radd) to any binop. For much the same reason as the existing code, pushing the concat up through the tree is hopefully quicker than (or the same as) the existing two half-vector operations, and it can help combine away the concat.


Full diff: https://github.com/llvm/llvm-project/pull/89911.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+5-7)
  • (modified) llvm/test/CodeGen/AArch64/concatbinop.ll (+50-31)
  • (modified) llvm/test/CodeGen/AArch64/vecreduce-add.ll (+8-6)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2238015e43b173..16e3b5023b4f82 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18647,14 +18647,12 @@ static SDValue performConcatVectorsCombine(SDNode *N,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
-  // destination size, combine into an avg of two contacts of the source
-  // vectors. eg: concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
-  // concat(b, d))
+  // Optimise concat_vectors of two identical binops with a 128-bit destination
+  // size, combine into a binop of two concats of the source vectors. eg:
+  // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
   if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
-      (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
-       N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
-      N0->hasOneUse() && N1->hasOneUse()) {
+      DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
+      N1->hasOneUse()) {
     SDValue N00 = N0->getOperand(0);
     SDValue N01 = N0->getOperand(1);
     SDValue N10 = N1->getOperand(0);
diff --git a/llvm/test/CodeGen/AArch64/concatbinop.ll b/llvm/test/CodeGen/AArch64/concatbinop.ll
index a13e62e0612cc0..828182d18b38ce 100644
--- a/llvm/test/CodeGen/AArch64/concatbinop.ll
+++ b/llvm/test/CodeGen/AArch64/concatbinop.ll
@@ -5,9 +5,13 @@
 define <8 x i16> @concat_add(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = add <4 x i16> %a, %b
   %y = add <4 x i16> %c, %d
@@ -33,13 +37,9 @@ define <8 x i16> @concat_addtunc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: concat_addtunc2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    add v1.4h, v2.4h, v3.4h
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %at = trunc <4 x i32> %a to <4 x i16>
   %bt = trunc <4 x i32> %b to <4 x i16>
@@ -54,9 +54,13 @@ define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = sub <4 x i16> %a, %b
   %y = sub <4 x i16> %c, %d
@@ -67,9 +71,13 @@ define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
 define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = mul <4 x i16> %a, %b
   %y = mul <4 x i16> %c, %d
@@ -80,9 +88,13 @@ define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
 define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
 ; CHECK-LABEL: concat_xor:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %x = xor <4 x i16> %a, %b
   %y = xor <4 x i16> %c, %d
@@ -93,9 +105,13 @@ define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16>
 define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: concat_fadd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fadd v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = fadd <4 x half> %a, %b
   %y = fadd <4 x half> %c, %d
@@ -106,9 +122,13 @@ define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x
 define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: concat_fmul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = fmul <4 x half> %a, %b
   %y = fmul <4 x half> %c, %d
@@ -119,9 +139,13 @@ define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x
 define <8 x half> @concat_min(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
 ; CHECK-LABEL: concat_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fminnm v2.4h, v2.4h, v3.4h
-; CHECK-NEXT:    fminnm v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v1.d[1], v3.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    fminnm v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %x = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
   %y = call <4 x half> @llvm.minnum.v4f16(<4 x half> %c, <4 x half> %d)
@@ -146,21 +170,16 @@ define <16 x i8> @signOf_neon(ptr nocapture noundef readonly %a, ptr nocapture n
 ; CHECK-LABEL: signOf_neon:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    movi v0.8b, #1
+; CHECK-NEXT:    movi v0.16b, #1
 ; CHECK-NEXT:    ldp q3, q4, [x1]
 ; CHECK-NEXT:    cmhi v5.8h, v1.8h, v3.8h
 ; CHECK-NEXT:    cmhi v6.8h, v2.8h, v4.8h
 ; CHECK-NEXT:    cmhi v1.8h, v3.8h, v1.8h
 ; CHECK-NEXT:    cmhi v2.8h, v4.8h, v2.8h
-; CHECK-NEXT:    xtn v3.8b, v5.8h
-; CHECK-NEXT:    xtn v4.8b, v6.8h
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    xtn v2.8b, v2.8h
-; CHECK-NEXT:    and v3.8b, v3.8b, v0.8b
-; CHECK-NEXT:    and v4.8b, v4.8b, v0.8b
-; CHECK-NEXT:    orr v0.8b, v3.8b, v1.8b
-; CHECK-NEXT:    orr v1.8b, v4.8b, v2.8b
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    uzp1 v3.16b, v5.16b, v6.16b
+; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i16>, ptr %a, align 2
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 3254c5ebe9c6b1..ab7cea8dfb7789 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2825,10 +2825,11 @@ entry:
 define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    movi v2.2d, #0x00ffff0000ffff
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    uaddlv d0, v0.4s
 ; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    ret
@@ -3578,10 +3579,11 @@ entry:
 define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    movi v2.2d, #0x0000ff000000ff
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    uaddlv d0, v0.4s
 ; CHECK-SD-NEXT:    fmov x0, d0
 ; CHECK-SD-NEXT:    ret

@SamTebbs33 (Collaborator) left a comment

Looks great to me.

; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
Collaborator

My guess is that this isn't faster than the old sequence because of the dependency. The other case below is clearly an improvement, so I guess overall this change is beneficial?

Collaborator Author

Yeah, that is the idea. If we change 2 x binop + concat into binop + 2 x concat, then sometimes the 'concat' is cheaper than the 'binop' and it's a little cheaper overall. If not, it can still lead to other simplifications and shouldn't be any worse.
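For reference, the concat_addtunc2 test in the diff above shows the kind of simplification this enables: once the concat is pushed above the adds it meets the truncates, and the old four xtn + two add + mov sequence becomes two uzp1 instructions and one full-width add. A minimal IR sketch of that test follows (the c/d truncates and the shufflevector concat are assumed, as the hunk only shows part of the body).

define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
  ; truncate each 32-bit input to 16 bits
  %at = trunc <4 x i32> %a to <4 x i16>
  %bt = trunc <4 x i32> %b to <4 x i16>
  %ct = trunc <4 x i32> %c to <4 x i16>
  %dt = trunc <4 x i32> %d to <4 x i16>
  ; half-width adds whose results are concatenated into an <8 x i16>
  %x = add <4 x i16> %at, %bt
  %y = add <4 x i16> %ct, %dt
  %r = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %r
}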

Collaborator

Ok, cheers, LGTM too

davemgreen merged commit 76ea5fe into llvm:main on Apr 25, 2024
5 of 6 checks passed
davemgreen deleted the gh-a64-concatbinop branch on April 25, 2024 at 11:12