-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Handle v2i16 and v2i8 in concat load combine. #86264
Conversation
This extends the concat load patch from https://reviews.llvm.org/D121400, which was later moved to a combine, to handle v2i8 and v2i16 concat loads too.
@llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) Changes: This extends the concat load patch from https://reviews.llvm.org/D121400, which was later moved to a combine, to handle v2i8 and v2i16 concat loads too. Full diff: https://github.com/llvm/llvm-project/pull/86264.diff — 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7fab274ab957c8..043a617898a69e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18629,12 +18629,16 @@ static SDValue performConcatVectorsCombine(SDNode *N,
}
}
- if (N->getOperand(0).getValueType() == MVT::v4i8) {
+ if (N->getOperand(0).getValueType() == MVT::v4i8 ||
+ N->getOperand(0).getValueType() == MVT::v2i16 ||
+ N->getOperand(0).getValueType() == MVT::v2i8) {
+ EVT SrcVT = N->getOperand(0).getValueType();
    // If we have a concat of v4i8 (or v2i16/v2i8) loads, convert them to a
    // buildvector of f32 (f16 for v2i8) loads to prevent having to go through
    // the small-vector load legalization that needs to extend each element
    // into a larger type.
- if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
- if (V.getValueType() != MVT::v4i8)
+ if (N->getNumOperands() % 2 == 0 &&
+ all_of(N->op_values(), [SrcVT](SDValue V) {
+ if (V.getValueType() != SrcVT)
return false;
if (V.isUndef())
return true;
@@ -18642,19 +18646,18 @@ static SDValue performConcatVectorsCombine(SDNode *N,
return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
LD->getExtensionType() == ISD::NON_EXTLOAD;
})) {
- EVT NVT =
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
+ EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
SmallVector<SDValue> Ops;
for (unsigned i = 0; i < N->getNumOperands(); i++) {
SDValue V = N->getOperand(i);
if (V.isUndef())
- Ops.push_back(DAG.getUNDEF(MVT::f32));
+ Ops.push_back(DAG.getUNDEF(FVT));
else {
LoadSDNode *LD = cast<LoadSDNode>(V);
- SDValue NewLoad =
- DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
- LD->getMemOperand());
+ SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
Ops.push_back(NewLoad);
}
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index 95ad9807ed6390..6828fa9f1508c8 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -377,12 +377,8 @@ define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -395,12 +391,9 @@ define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-NEXT: adrp x8, .LCPI33_0
-; CHECK-NEXT: mov v0.b[1], v0.b[4]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
@@ -414,12 +407,8 @@ define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[1], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -432,12 +421,8 @@ define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[2], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -450,12 +435,8 @@ define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v2.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: mov v2.b[1], v0.b[4]
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr h2, [x0]
; CHECK-NEXT: mov v0.h[3], v2.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -468,11 +449,9 @@ define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -485,10 +464,8 @@ define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -504,13 +481,8 @@ define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[0], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[0], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -521,13 +493,9 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
-; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: adrp x8, .LCPI40_0
-; CHECK-NEXT: ld1 { v2.h }[2], [x9]
-; CHECK-NEXT: xtn v0.4h, v2.4s
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI40_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
@@ -540,13 +508,8 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[1], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -557,13 +520,8 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[2], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -574,13 +532,8 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: add x9, x0, #2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: xtn v2.4h, v0.4s
; CHECK-NEXT: mov v0.16b, v1.16b
-; CHECK-NEXT: mov v0.s[3], v2.s[0]
+; CHECK-NEXT: ld1 { v0.s }[3], [x0]
; CHECK-NEXT: ret
%l = load <2 x i16>, ptr %a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -591,11 +544,8 @@ define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: add x8, x0, #2
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -608,11 +558,8 @@ define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ldr s2, [x0]
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like a very good improvement.
This extends the concat load patch from https://reviews.llvm.org/D121400, which was later moved to a combine, to handle v2i8 and v2i16 concat loads too.