Skip to content

Commit

Permalink
[AArch64] Turn UZP1 with undef operand into truncate
Browse files Browse the repository at this point in the history
This turns uzp1(x, undef) into concat(truncate(x), undef), as the truncate
is simpler and can often be optimized away, and it helps some of the
insert-subvector tests optimize more cleanly.

Differential Revision: https://reviews.llvm.org/D120879
  • Loading branch information
davemgreen committed Mar 4, 2022
1 parent 6b5b367 commit e348b09
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 46 deletions.
27 changes: 27 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16179,6 +16179,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
SDValue Op1 = N->getOperand(1);
EVT ResVT = N->getValueType(0);

// uzp1(x, undef) -> concat(truncate(x), undef)
if (Op1.getOpcode() == ISD::UNDEF) {
EVT BCVT = MVT::Other, HalfVT = MVT::Other;
switch (ResVT.getSimpleVT().SimpleTy) {
default:
break;
case MVT::v16i8:
BCVT = MVT::v8i16;
HalfVT = MVT::v8i8;
break;
case MVT::v8i16:
BCVT = MVT::v4i32;
HalfVT = MVT::v4i16;
break;
case MVT::v4i32:
BCVT = MVT::v2i64;
HalfVT = MVT::v2i32;
break;
}
if (BCVT != MVT::Other) {
SDValue BC = DAG.getBitcast(BCVT, Op0);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
DAG.getUNDEF(HalfVT));
}
}

// uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
Expand Down
66 changes: 28 additions & 38 deletions llvm/test/CodeGen/AArch64/insert-subvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,9 @@ define <4 x i32> @insert_v4i32_2_2(float %tmp, <4 x i32> %b, <4 x i32> %a) {
define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[0], v2.s[0]
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%l = load <4 x i8>, <4 x i8> *%a
%s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -275,11 +273,9 @@ define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_15:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: adrp x8, .LCPI24_0
; CHECK-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT: uzp1 v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
Expand All @@ -292,11 +288,9 @@ define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: ret
%l = load <4 x i8>, <4 x i8> *%a
%s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -307,11 +301,9 @@ define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_3:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: mov v0.s[2], v1.s[0]
; CHECK-NEXT: ret
%l = load <4 x i8>, <4 x i8> *%a
%s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -322,11 +314,9 @@ define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
define <16 x i8> @load_v16i8_4_4(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_4:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: uzp1 v2.16b, v0.16b, v0.16b
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[3], v2.s[0]
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
%l = load <4 x i8>, <4 x i8> *%a
%s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand Down Expand Up @@ -399,11 +389,11 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[0], v2.s[0]
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%l = load <2 x i16>, <2 x i16> *%a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -420,9 +410,9 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: adrp x8, .LCPI33_0
; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: xtn v0.4h, v2.4s
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v3.16b
; CHECK-NEXT: ret
%l = load <2 x i16>, <2 x i16> *%a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -435,11 +425,11 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: ret
%l = load <2 x i16>, <2 x i16> *%a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -452,11 +442,11 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[2], v1.s[0]
; CHECK-NEXT: ret
%l = load <2 x i16>, <2 x i16> *%a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -469,11 +459,11 @@ define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v2.8h, v0.8h, v0.8h
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[3], v2.s[0]
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
%l = load <2 x i16>, <2 x i16> *%a
%s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/neon-perm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2203,7 +2203,7 @@ entry:
define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) {
; CHECK-LABEL: test_undef_vuzp1q_s8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
Expand All @@ -2223,7 +2223,7 @@ entry:
define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) {
; CHECK-LABEL: test_undef_vuzp1q_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
Expand All @@ -2233,7 +2233,7 @@ entry:
define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) {
; CHECK-LABEL: test_undef_vuzp1q_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
Expand All @@ -2253,7 +2253,7 @@ entry:
define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) {
; CHECK-LABEL: test_undef_vuzp1q_u8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
Expand All @@ -2273,7 +2273,7 @@ entry:
define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) {
; CHECK-LABEL: test_undef_vuzp1q_u16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
Expand All @@ -2283,7 +2283,7 @@ entry:
define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) {
; CHECK-LABEL: test_undef_vuzp1q_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
Expand Down Expand Up @@ -2313,7 +2313,7 @@ entry:
define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) {
; CHECK-LABEL: test_undef_vuzp1q_p8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
Expand All @@ -2333,7 +2333,7 @@ entry:
define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) {
; CHECK-LABEL: test_undef_vuzp1q_p16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
Expand Down

0 comments on commit e348b09

Please sign in to comment.