
[AArch64] Handle v2i16 and v2i8 in concat load combine. #86264

Merged: 1 commit, Mar 25, 2024

Conversation

davemgreen (Collaborator)

This extends the concat load patch from https://reviews.llvm.org/D121400, which was later moved to a combine, to handle v2i8 and v2i16 concat loads too.
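For illustration, the pattern this targets looks like the sketch below, an illustrative reduction of the v2i16 tests in llvm/test/CodeGen/AArch64/insert-subvector.ll (the function name is made up, not one of the checked-in tests): a <2 x i16> load widened by a concat-style shuffle. With this change the two i16 lanes are fetched with a single 32-bit load (ldr s / ld1 { v.s }[n]), and a v2i8 pair analogously with a single 16-bit load (ldr h), rather than loading each element and extending it separately.

; Illustrative sketch only, modelled on the insert-subvector.ll tests.
define <8 x i16> @concat_v2i16_load(ptr %a) {
  %l = load <2 x i16>, ptr %a
  %c = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %c
}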

llvmbot (Collaborator) commented Mar 22, 2024

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This extends the concat load patch from https://reviews.llvm.org/D121400, which was later moved to a combine, to handle v2i8 and v2i16 concat loads too.


Full diff: https://github.com/llvm/llvm-project/pull/86264.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+12-9)
  • (modified) llvm/test/CodeGen/AArch64/insert-subvector.ll (+16-69)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7fab274ab957c8..043a617898a69e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18629,12 +18629,16 @@ static SDValue performConcatVectorsCombine(SDNode *N,
     }
   }
 
-  if (N->getOperand(0).getValueType() == MVT::v4i8) {
+  if (N->getOperand(0).getValueType() == MVT::v4i8 ||
+      N->getOperand(0).getValueType() == MVT::v2i16 ||
+      N->getOperand(0).getValueType() == MVT::v2i8) {
+    EVT SrcVT = N->getOperand(0).getValueType();
     // If we have a concat of v4i8 loads, convert them to a buildvector of f32
     // loads to prevent having to go through the v4i8 load legalization that
     // needs to extend each element into a larger type.
-    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
-          if (V.getValueType() != MVT::v4i8)
+    if (N->getNumOperands() % 2 == 0 &&
+        all_of(N->op_values(), [SrcVT](SDValue V) {
+          if (V.getValueType() != SrcVT)
             return false;
           if (V.isUndef())
             return true;
@@ -18642,19 +18646,18 @@ static SDValue performConcatVectorsCombine(SDNode *N,
           return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
                  LD->getExtensionType() == ISD::NON_EXTLOAD;
         })) {
-      EVT NVT =
-          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
+      EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
+      EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
       SmallVector<SDValue> Ops;
 
       for (unsigned i = 0; i < N->getNumOperands(); i++) {
         SDValue V = N->getOperand(i);
         if (V.isUndef())
-          Ops.push_back(DAG.getUNDEF(MVT::f32));
+          Ops.push_back(DAG.getUNDEF(FVT));
         else {
           LoadSDNode *LD = cast<LoadSDNode>(V);
-          SDValue NewLoad =
-              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
-                          LD->getMemOperand());
+          SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
+                                        LD->getBasePtr(), LD->getMemOperand());
           DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
           Ops.push_back(NewLoad);
         }
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index 95ad9807ed6390..6828fa9f1508c8 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -377,12 +377,8 @@ define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
 define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i8_2_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v2.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    mov v2.b[1], v0.b[4]
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ldr h2, [x0]
 ; CHECK-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -395,12 +391,9 @@ define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
 define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i8_2_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
+; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
 ; CHECK-NEXT:    adrp x8, .LCPI33_0
-; CHECK-NEXT:    mov v0.b[1], v0.b[4]
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI33_0]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
@@ -414,12 +407,8 @@ define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
 define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i8_2_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v2.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    mov v2.b[1], v0.b[4]
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ldr h2, [x0]
 ; CHECK-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -432,12 +421,8 @@ define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
 define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i8_2_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v2.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    mov v2.b[1], v0.b[4]
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ldr h2, [x0]
 ; CHECK-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -450,12 +435,8 @@ define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
 define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i8_2_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v2.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    mov v2.b[1], v0.b[4]
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ldr h2, [x0]
 ; CHECK-NEXT:    mov v0.h[3], v2.h[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -468,11 +449,9 @@ define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
 define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v4i8_2_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
+; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    zip1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -485,10 +464,8 @@ define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
 define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v4i8_2_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #1
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    zip1 v2.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -504,13 +481,8 @@ define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    add x9, x0, #2
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[0], v2.s[0]
+; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -521,13 +493,9 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    add x9, x0, #2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
-; CHECK-NEXT:    fmov s2, w8
 ; CHECK-NEXT:    adrp x8, .LCPI40_0
-; CHECK-NEXT:    ld1 { v2.h }[2], [x9]
-; CHECK-NEXT:    xtn v0.4h, v2.4s
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI40_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
@@ -540,13 +508,8 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    add x9, x0, #2
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -557,13 +520,8 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    add x9, x0, #2
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-NEXT:    ld1 { v0.s }[2], [x0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -574,13 +532,8 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    add x9, x0, #2
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
-; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.s[3], v2.s[0]
+; CHECK-NEXT:    ld1 { v0.s }[3], [x0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -591,11 +544,8 @@ define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
 define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v4i16_2_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -608,11 +558,8 @@ define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
 define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v4i16_2_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
-; CHECK-NEXT:    add x8, x0, #2
-; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
-; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ldr s2, [x0]
 ; CHECK-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret

sjoerdmeijer (Collaborator) left a comment:

Looks like a very good improvement.

davemgreen merged commit 96819da into llvm:main on Mar 25, 2024
6 checks passed