Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DAGCombiner] Avoid the pre-truncate of BUILD_VECTOR sources. #75792

Merged
merged 3 commits into llvm:main
Dec 19, 2023

Conversation

Rin18
Copy link
Contributor

@Rin18 Rin18 commented Dec 18, 2023

Do not pre-truncate the sources of a BUILD_VECTOR when the BUILD_VECTOR node has more than one use. This can avoid generating unnecessary mov instructions later in the instruction selection pipeline.

@llvmbot llvmbot added the backend:AArch64 and llvm:SelectionDAG (SelectionDAGISel as well) labels on Dec 18, 2023
@llvmbot
Copy link
Collaborator

llvmbot commented Dec 18, 2023

@llvm/pr-subscribers-backend-aarch64

Author: Rin (Rin18)

Changes

Do not pre-truncate the sources of a BUILD_VECTOR when the BUILD_VECTOR node has more than one use. This can avoid generating unnecessary mov instructions later in the instruction selection pipeline.


Full diff: https://github.com/llvm/llvm-project/pull/75792.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+1)
  • (added) llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll (+89)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1d4bfa6fde0352..02d2bb41b051db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14759,6 +14759,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
 
   // Attempt to pre-truncate BUILD_VECTOR sources.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
+      N0.hasOneUse() &&
       TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
       // Avoid creating illegal types if running after type legalizer.
       (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
diff --git a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
new file mode 100644
index 00000000000000..072694b36549cb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define i32 @lower_lshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) local_unnamed_addr #0 {
+; CHECK-LABEL: lower_lshr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    addv s1, v1.4s
+; CHECK-NEXT:    addv s4, v4.4s
+; CHECK-NEXT:    addv s5, v5.4s
+; CHECK-NEXT:    addv s2, v2.4s
+; CHECK-NEXT:    addv s6, v6.4s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    addv s1, v3.4s
+; CHECK-NEXT:    addv s3, v7.4s
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-NEXT:    mov v4.s[2], v6.s[0]
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-NEXT:    xtn v2.4h, v0.4s
+; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-NEXT:    xtn v1.4h, v4.4s
+; CHECK-NEXT:    shrn v3.4h, v4.4s, #16
+; CHECK-NEXT:    uhadd v0.4h, v2.4h, v0.4h
+; CHECK-NEXT:    uhadd v1.4h, v1.4h, v3.4h
+; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %l87  = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+  %l174 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
+  %l257 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
+  %l340 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
+  %l427 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
+  %l514 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %f)
+  %l597 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %g)
+  %l680 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %h)
+  %l681 = insertelement <8 x i32> poison, i32 %l87, i32 0
+  %l682 = insertelement <8 x i32> %l681, i32 %l174, i32 1
+  %l683 = insertelement <8 x i32> %l682, i32 %l257, i32 2
+  %l684 = insertelement <8 x i32> %l683, i32 %l340, i32 3
+  %l685 = insertelement <8 x i32> %l684, i32 %l427, i32 4
+  %l686 = insertelement <8 x i32> %l685, i32 %l514, i32 5
+  %l687 = insertelement <8 x i32> %l686, i32 %l597, i32 6
+  %l688 = insertelement <8 x i32> %l687, i32 %l680, i32 7
+  %l689 = and <8 x i32> %l688, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %l690 = lshr <8 x i32> %l688, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %l691 = add nuw nsw <8 x i32> %l689, %l690
+  %l692 = lshr <8 x i32> %l691, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %l693 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l692)
+  ret i32 %l693
+}
+
+define <8 x i16> @lower_trunc(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) local_unnamed_addr #0 {
+; CHECK-LABEL: lower_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w4
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    mov v0.s[1], w5
+; CHECK-NEXT:    mov v1.s[1], w1
+; CHECK-NEXT:    mov v0.s[2], w6
+; CHECK-NEXT:    mov v1.s[2], w2
+; CHECK-NEXT:    mov v0.s[3], w7
+; CHECK-NEXT:    mov v1.s[3], w3
+; CHECK-NEXT:    add v2.4s, v0.4s, v0.4s
+; CHECK-NEXT:    add v3.4s, v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %a1 = insertelement <8 x i32> poison, i32 %a, i32 0
+  %b1 = insertelement <8 x i32> %a1, i32 %b, i32 1
+  %c1 = insertelement <8 x i32> %b1, i32 %c, i32 2
+  %d1 = insertelement <8 x i32> %c1, i32 %d, i32 3
+  %e1 = insertelement <8 x i32> %d1, i32 %e, i32 4
+  %f1 = insertelement <8 x i32> %e1, i32 %f, i32 5
+  %g1 = insertelement <8 x i32> %f1, i32 %g, i32 6
+  %h1 = insertelement <8 x i32> %g1, i32 %h, i32 7
+  %t = trunc <8 x i32> %h1 to <8 x i16>
+  %s = add <8 x i32> %h1, %h1
+  %t2 = trunc <8 x i32> %s to <8 x i16>
+  %o = xor <8 x i16> %t, %t2
+  ret <8 x i16> %o
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

@llvmbot
Copy link
Collaborator

llvmbot commented Dec 18, 2023

@llvm/pr-subscribers-llvm-selectiondag

Author: Rin (Rin18)

Changes

Do not pre-truncate the sources of a BUILD_VECTOR when the BUILD_VECTOR node has more than one use. This can avoid generating unnecessary mov instructions later in the instruction selection pipeline.


Full diff: https://github.com/llvm/llvm-project/pull/75792.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+1)
  • (added) llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll (+89)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1d4bfa6fde0352..02d2bb41b051db 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14759,6 +14759,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
 
   // Attempt to pre-truncate BUILD_VECTOR sources.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
+      N0.hasOneUse() &&
       TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
       // Avoid creating illegal types if running after type legalizer.
       (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
diff --git a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
new file mode 100644
index 00000000000000..072694b36549cb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define i32 @lower_lshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) local_unnamed_addr #0 {
+; CHECK-LABEL: lower_lshr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    addv s1, v1.4s
+; CHECK-NEXT:    addv s4, v4.4s
+; CHECK-NEXT:    addv s5, v5.4s
+; CHECK-NEXT:    addv s2, v2.4s
+; CHECK-NEXT:    addv s6, v6.4s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    addv s1, v3.4s
+; CHECK-NEXT:    addv s3, v7.4s
+; CHECK-NEXT:    mov v4.s[1], v5.s[0]
+; CHECK-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-NEXT:    mov v4.s[2], v6.s[0]
+; CHECK-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-NEXT:    mov v4.s[3], v3.s[0]
+; CHECK-NEXT:    xtn v2.4h, v0.4s
+; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-NEXT:    xtn v1.4h, v4.4s
+; CHECK-NEXT:    shrn v3.4h, v4.4s, #16
+; CHECK-NEXT:    uhadd v0.4h, v2.4h, v0.4h
+; CHECK-NEXT:    uhadd v1.4h, v1.4h, v3.4h
+; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %l87  = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+  %l174 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
+  %l257 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
+  %l340 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
+  %l427 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
+  %l514 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %f)
+  %l597 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %g)
+  %l680 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %h)
+  %l681 = insertelement <8 x i32> poison, i32 %l87, i32 0
+  %l682 = insertelement <8 x i32> %l681, i32 %l174, i32 1
+  %l683 = insertelement <8 x i32> %l682, i32 %l257, i32 2
+  %l684 = insertelement <8 x i32> %l683, i32 %l340, i32 3
+  %l685 = insertelement <8 x i32> %l684, i32 %l427, i32 4
+  %l686 = insertelement <8 x i32> %l685, i32 %l514, i32 5
+  %l687 = insertelement <8 x i32> %l686, i32 %l597, i32 6
+  %l688 = insertelement <8 x i32> %l687, i32 %l680, i32 7
+  %l689 = and <8 x i32> %l688, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %l690 = lshr <8 x i32> %l688, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %l691 = add nuw nsw <8 x i32> %l689, %l690
+  %l692 = lshr <8 x i32> %l691, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %l693 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l692)
+  ret i32 %l693
+}
+
+define <8 x i16> @lower_trunc(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) local_unnamed_addr #0 {
+; CHECK-LABEL: lower_trunc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w4
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    mov v0.s[1], w5
+; CHECK-NEXT:    mov v1.s[1], w1
+; CHECK-NEXT:    mov v0.s[2], w6
+; CHECK-NEXT:    mov v1.s[2], w2
+; CHECK-NEXT:    mov v0.s[3], w7
+; CHECK-NEXT:    mov v1.s[3], w3
+; CHECK-NEXT:    add v2.4s, v0.4s, v0.4s
+; CHECK-NEXT:    add v3.4s, v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    uzp1 v1.8h, v3.8h, v2.8h
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %a1 = insertelement <8 x i32> poison, i32 %a, i32 0
+  %b1 = insertelement <8 x i32> %a1, i32 %b, i32 1
+  %c1 = insertelement <8 x i32> %b1, i32 %c, i32 2
+  %d1 = insertelement <8 x i32> %c1, i32 %d, i32 3
+  %e1 = insertelement <8 x i32> %d1, i32 %e, i32 4
+  %f1 = insertelement <8 x i32> %e1, i32 %f, i32 5
+  %g1 = insertelement <8 x i32> %f1, i32 %g, i32 6
+  %h1 = insertelement <8 x i32> %g1, i32 %h, i32 7
+  %t = trunc <8 x i32> %h1 to <8 x i16>
+  %s = add <8 x i32> %h1, %h1
+  %t2 = trunc <8 x i32> %s to <8 x i16>
+  %o = xor <8 x i16> %t, %t2
+  ret <8 x i16> %o
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM - cheers

@Rin18 Rin18 merged commit 0894c2e into llvm:main Dec 19, 2023
4 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AArch64 llvm:SelectionDAG SelectionDAGISel as well
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

6 participants