Skip to content

Commit

Permalink
[AArch64] Cost all perfect shuffles entries as cost 1
Browse files Browse the repository at this point in the history
A brief introduction to perfect shuffles - AArch64 NEON has a number of
shuffle operations - dups, zips, exts, movs etc that can in some way
shuffle around the lanes of a vector. Given a shuffle of size 4 with 2
inputs, some shuffle masks can be easily codegen'd to a single
instruction. A <0,0,1,1> mask for example is a zip LHS, LHS. This is
great, but some masks are not so simple, like a <0,0,1,2>. It turns out
we can generate that from zip LHS, <0,2,0,2>, having generated
<0,2,0,2> from uzp LHS, LHS, producing the result in 2 instructions.

It is not obvious from a given mask how to get there though. So we have
a simple program (llvm/utils/PerfectShuffle/PerfectShuffle.cpp) that can scan
through all combinations of 4-element vectors and generate the perfect
combination of results needed for each shuffle mask (for some definition
of perfect). This is run offline to generate a table that is queried for
generating shuffle instructions. (Because the table could get quite big,
it is limited to 4 element vectors).

In the perfect shuffle tables, zip, uzp and trn shuffles were being costed
as 2, which is higher than needed and skews the perfect shuffle tables
to create inefficient combinations. This sets them to 1 and regenerates
the tables. The codegen will usually be better and the costs should be
more precise (but it can get less second-order re-use of values from
multiple shuffles; these cases should be fixed up in subsequent patches).

Differential Revision: https://reviews.llvm.org/D123379
  • Loading branch information
davemgreen committed Apr 19, 2022
1 parent 8daffd1 commit 50af827
Show file tree
Hide file tree
Showing 11 changed files with 3,973 additions and 3,973 deletions.
7,712 changes: 3,855 additions & 3,857 deletions llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
Expand Up @@ -8,9 +8,9 @@ define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
; CHECK-LABEL: f:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: dup v0.4h, v0.h[0]
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #2
; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #4
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ext v1.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ret
entry:
; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext
Expand Down
5 changes: 3 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-dup.ll
Expand Up @@ -446,9 +446,10 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
define void @disguised_dup(<4 x float> %x, <4 x float>* %p1, <4 x float>* %p2) {
; CHECK-LABEL: disguised_dup:
; CHECK: // %bb.0:
; CHECK-NEXT: ext.16b v1, v0, v0, #12
; CHECK-NEXT: uzp1.4s v1, v0, v0
; CHECK-NEXT: uzp2.4s v2, v0, v1
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ext.16b v1, v1, v0, #8
; CHECK-NEXT: uzp1.4s v1, v2, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-rev.ll
Expand Up @@ -561,8 +561,8 @@ define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ext.16b v0, v1, v0, #12
; CHECK-NEXT: rev64.4s v0, v0
; CHECK-NEXT: trn2.4s v1, v1, v0
; CHECK-NEXT: ext.16b v0, v1, v0, #4
; CHECK-NEXT: str q0, [x1, #176]
; CHECK-NEXT: ret
;
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AArch64/build-vector-extract.ll
Expand Up @@ -31,8 +31,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12
; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 1
%z = zext i32 %e to i64
Expand All @@ -58,8 +58,8 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 2
%z = zext i32 %e to i64
Expand Down Expand Up @@ -138,7 +138,7 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 1
Expand Down
165 changes: 84 additions & 81 deletions llvm/test/CodeGen/AArch64/insert-extend.ll
Expand Up @@ -88,120 +88,123 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
; CHECK-NEXT: usubl v5.4s, v5.4h, v7.4h
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NEXT: ushll v3.8h, v3.8b, #0
; CHECK-NEXT: shl v5.4s, v5.4s, #16
; CHECK-NEXT: shl v7.4s, v16.4s, #16
; CHECK-NEXT: usubl2 v16.4s, v2.8h, v3.8h
; CHECK-NEXT: shl v5.4s, v5.4s, #16
; CHECK-NEXT: usubl v2.4s, v2.4h, v3.4h
; CHECK-NEXT: add v0.4s, v7.4s, v0.4s
; CHECK-NEXT: add v3.4s, v5.4s, v6.4s
; CHECK-NEXT: shl v5.4s, v16.4s, #16
; CHECK-NEXT: shl v2.4s, v2.4s, #16
; CHECK-NEXT: add v0.4s, v7.4s, v0.4s
; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
; CHECK-NEXT: add v2.4s, v5.4s, v4.4s
; CHECK-NEXT: shl v5.4s, v16.4s, #16
; CHECK-NEXT: rev64 v6.4s, v3.4s
; CHECK-NEXT: rev64 v7.4s, v0.4s
; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
; CHECK-NEXT: add v2.4s, v5.4s, v4.4s
; CHECK-NEXT: rev64 v4.4s, v1.4s
; CHECK-NEXT: rev64 v5.4s, v2.4s
; CHECK-NEXT: add v17.4s, v3.4s, v6.4s
; CHECK-NEXT: add v16.4s, v0.4s, v7.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT: add v17.4s, v3.4s, v6.4s
; CHECK-NEXT: add v22.4s, v1.4s, v4.4s
; CHECK-NEXT: uzp2 v18.4s, v17.4s, v16.4s
; CHECK-NEXT: uzp2 v19.4s, v16.4s, v17.4s
; CHECK-NEXT: add v21.4s, v2.4s, v5.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
; CHECK-NEXT: add v18.4s, v2.4s, v5.4s
; CHECK-NEXT: add v19.4s, v1.4s, v4.4s
; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
; CHECK-NEXT: trn2 v20.4s, v17.4s, v16.4s
; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
; CHECK-NEXT: trn2 v3.4s, v17.4s, v16.4s
; CHECK-NEXT: trn2 v4.4s, v19.4s, v18.4s
; CHECK-NEXT: zip1 v4.4s, v22.4s, v21.4s
; CHECK-NEXT: uzp2 v17.4s, v18.4s, v17.4s
; CHECK-NEXT: zip2 v18.4s, v22.4s, v21.4s
; CHECK-NEXT: uzp2 v16.4s, v19.4s, v16.4s
; CHECK-NEXT: zip1 v5.4s, v1.4s, v2.4s
; CHECK-NEXT: zip2 v18.4s, v19.4s, v18.4s
; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #8
; CHECK-NEXT: ext v16.16b, v6.16b, v16.16b, #8
; CHECK-NEXT: ext v4.16b, v19.16b, v4.16b, #8
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT: mov v6.d[1], v4.d[1]
; CHECK-NEXT: mov v16.d[1], v18.d[1]
; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s
; CHECK-NEXT: ext v5.16b, v1.16b, v5.16b, #8
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
; CHECK-NEXT: mov v1.s[3], v2.s[2]
; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s
; CHECK-NEXT: mov v17.d[1], v18.d[1]
; CHECK-NEXT: mov v16.d[1], v18.d[1]
; CHECK-NEXT: mov v6.d[1], v4.d[1]
; CHECK-NEXT: mov v3.d[1], v4.d[1]
; CHECK-NEXT: mov v20.d[1], v4.d[1]
; CHECK-NEXT: rev64 v6.4s, v3.4s
; CHECK-NEXT: mov v0.d[1], v5.d[1]
; CHECK-NEXT: mov v7.d[1], v1.d[1]
; CHECK-NEXT: sub v1.4s, v6.4s, v16.4s
; CHECK-NEXT: add v2.4s, v17.4s, v3.4s
; CHECK-NEXT: rev64 v3.4s, v1.4s
; CHECK-NEXT: add v5.4s, v7.4s, v0.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT: add v2.4s, v17.4s, v20.4s
; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s
; CHECK-NEXT: add v0.4s, v7.4s, v0.4s
; CHECK-NEXT: rev64 v4.4s, v2.4s
; CHECK-NEXT: rev64 v6.4s, v5.4s
; CHECK-NEXT: rev64 v7.4s, v0.4s
; CHECK-NEXT: add v16.4s, v1.4s, v3.4s
; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s
; CHECK-NEXT: add v3.4s, v2.4s, v4.4s
; CHECK-NEXT: add v17.4s, v0.4s, v7.4s
; CHECK-NEXT: add v18.4s, v5.4s, v6.4s
; CHECK-NEXT: sub v5.4s, v5.4s, v6.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT: ext v16.16b, v1.16b, v16.16b, #12
; CHECK-NEXT: ext v6.16b, v5.16b, v18.16b, #12
; CHECK-NEXT: rev64 v7.4s, v6.4s
; CHECK-NEXT: rev64 v16.4s, v0.4s
; CHECK-NEXT: add v1.4s, v2.4s, v4.4s
; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s
; CHECK-NEXT: ext v4.16b, v0.16b, v17.16b, #12
; CHECK-NEXT: rev64 v3.4s, v3.4s
; CHECK-NEXT: ext v7.16b, v16.16b, v1.16b, #4
; CHECK-NEXT: ext v17.16b, v6.16b, v5.16b, #4
; CHECK-NEXT: ext v18.16b, v6.16b, v6.16b, #8
; CHECK-NEXT: ext v19.16b, v4.16b, v0.16b, #4
; CHECK-NEXT: ext v20.16b, v4.16b, v4.16b, #8
; CHECK-NEXT: ext v21.16b, v16.16b, v16.16b, #8
; CHECK-NEXT: rev64 v16.4s, v16.4s
; CHECK-NEXT: trn2 v2.4s, v3.4s, v2.4s
; CHECK-NEXT: rev64 v3.4s, v4.4s
; CHECK-NEXT: rev64 v4.4s, v6.4s
; CHECK-NEXT: ext v17.16b, v17.16b, v18.16b, #12
; CHECK-NEXT: ext v18.16b, v19.16b, v20.16b, #12
; CHECK-NEXT: ext v7.16b, v7.16b, v21.16b, #12
; CHECK-NEXT: ext v1.16b, v16.16b, v1.16b, #4
; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #4
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #4
; CHECK-NEXT: add v4.4s, v7.4s, v1.4s
; CHECK-NEXT: add v5.4s, v2.4s, v6.4s
; CHECK-NEXT: add v16.4s, v18.4s, v0.4s
; CHECK-NEXT: add v19.4s, v17.4s, v3.4s
; CHECK-NEXT: sub v1.4s, v7.4s, v1.4s
; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s
; CHECK-NEXT: sub v3.4s, v17.4s, v3.4s
; CHECK-NEXT: sub v0.4s, v18.4s, v0.4s
; CHECK-NEXT: mov v19.d[1], v3.d[1]
; CHECK-NEXT: mov v16.d[1], v0.d[1]
; CHECK-NEXT: mov v4.d[1], v1.d[1]
; CHECK-NEXT: mov v5.d[1], v2.d[1]
; CHECK-NEXT: ext v4.16b, v3.16b, v5.16b, #12
; CHECK-NEXT: add v5.4s, v6.4s, v7.4s
; CHECK-NEXT: add v17.4s, v0.4s, v16.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v16.4s
; CHECK-NEXT: sub v6.4s, v6.4s, v7.4s
; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #12
; CHECK-NEXT: ext v5.16b, v6.16b, v5.16b, #12
; CHECK-NEXT: rev64 v22.4s, v1.4s
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4
; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
; CHECK-NEXT: ext v18.16b, v7.16b, v0.16b, #4
; CHECK-NEXT: ext v19.16b, v7.16b, v7.16b, #8
; CHECK-NEXT: ext v20.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: ext v21.16b, v5.16b, v5.16b, #8
; CHECK-NEXT: rev64 v5.4s, v5.4s
; CHECK-NEXT: rev64 v7.4s, v7.4s
; CHECK-NEXT: rev64 v4.4s, v4.4s
; CHECK-NEXT: trn2 v1.4s, v2.4s, v1.4s
; CHECK-NEXT: ext v16.16b, v16.16b, v17.16b, #12
; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
; CHECK-NEXT: trn2 v19.4s, v22.4s, v2.4s
; CHECK-NEXT: ext v2.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #4
; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #4
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-NEXT: add v4.4s, v18.4s, v2.4s
; CHECK-NEXT: add v5.4s, v17.4s, v0.4s
; CHECK-NEXT: add v6.4s, v16.4s, v3.4s
; CHECK-NEXT: add v7.4s, v19.4s, v1.4s
; CHECK-NEXT: sub v2.4s, v18.4s, v2.4s
; CHECK-NEXT: sub v0.4s, v17.4s, v0.4s
; CHECK-NEXT: sub v1.4s, v19.4s, v1.4s
; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s
; CHECK-NEXT: mov v7.d[1], v1.d[1]
; CHECK-NEXT: mov v6.d[1], v3.d[1]
; CHECK-NEXT: mov v4.d[1], v2.d[1]
; CHECK-NEXT: mov v5.d[1], v0.d[1]
; CHECK-NEXT: movi v0.8h, #1
; CHECK-NEXT: movi v7.2d, #0x00ffff0000ffff
; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff
; CHECK-NEXT: ushr v1.4s, v4.4s, #15
; CHECK-NEXT: ushr v2.4s, v19.4s, #15
; CHECK-NEXT: ushr v2.4s, v7.4s, #15
; CHECK-NEXT: ushr v3.4s, v5.4s, #15
; CHECK-NEXT: ushr v6.4s, v16.4s, #15
; CHECK-NEXT: ushr v16.4s, v6.4s, #15
; CHECK-NEXT: and v2.16b, v2.16b, v0.16b
; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
; CHECK-NEXT: and v16.16b, v16.16b, v0.16b
; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: mul v1.4s, v2.4s, v7.4s
; CHECK-NEXT: mul v2.4s, v6.4s, v7.4s
; CHECK-NEXT: mul v0.4s, v0.4s, v7.4s
; CHECK-NEXT: mul v3.4s, v3.4s, v7.4s
; CHECK-NEXT: add v6.4s, v1.4s, v19.4s
; CHECK-NEXT: add v7.4s, v2.4s, v16.4s
; CHECK-NEXT: mul v1.4s, v2.4s, v17.4s
; CHECK-NEXT: mul v2.4s, v16.4s, v17.4s
; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s
; CHECK-NEXT: mul v3.4s, v3.4s, v17.4s
; CHECK-NEXT: add v7.4s, v1.4s, v7.4s
; CHECK-NEXT: add v6.4s, v2.4s, v6.4s
; CHECK-NEXT: add v4.4s, v0.4s, v4.4s
; CHECK-NEXT: add v5.4s, v3.4s, v5.4s
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b
; CHECK-NEXT: eor v2.16b, v7.16b, v2.16b
; CHECK-NEXT: eor v1.16b, v6.16b, v1.16b
; CHECK-NEXT: eor v2.16b, v6.16b, v2.16b
; CHECK-NEXT: eor v1.16b, v7.16b, v1.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: lsr w9, w8, #16
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AArch64/neon-wide-splat.ll
Expand Up @@ -109,9 +109,9 @@ define <8 x i8> @shuffle_not1(<16 x i8> %v) {
define <4 x i32> @shuffle_not2(<4 x i32> %v) {
; CHECK-LABEL: shuffle_not2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #4
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: trn1 v1.4s, v0.4s, v0.4s
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
entry:
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 2>
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
Expand Up @@ -232,9 +232,9 @@ define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
define <4 x i32> @shuffle4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: shuffle4_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v2.16b, v2.16b, v3.16b, #8
; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v1.16b, v3.16b, v2.16b, #12
; CHECK-NEXT: rev64 v3.4s, v3.4s
; CHECK-NEXT: zip1 v4.4s, v1.4s, v1.4s
; CHECK-NEXT: zip2 v1.4s, v3.4s, v2.4s
; CHECK-NEXT: ext v0.16b, v4.16b, v0.16b, #4
; CHECK-NEXT: mov v1.d[1], v0.d[1]
; CHECK-NEXT: mov v0.16b, v1.16b
Expand Down
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/AArch64/shuffles.ll
Expand Up @@ -4,11 +4,11 @@
define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: test_shuf1:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v3.16b, v6.16b, v7.16b, #8
; CHECK-NEXT: zip2 v3.4s, v7.4s, v6.4s
; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #12
; CHECK-NEXT: uzp1 v6.4s, v1.4s, v0.4s
; CHECK-NEXT: uzp2 v4.4s, v2.4s, v4.4s
; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #12
; CHECK-NEXT: trn2 v3.4s, v7.4s, v3.4s
; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #8
; CHECK-NEXT: trn2 v6.4s, v6.4s, v1.4s
; CHECK-NEXT: trn1 v2.4s, v4.4s, v2.4s
Expand All @@ -28,9 +28,9 @@ define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
define <4 x i32> @test_shuf2(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: test_shuf2:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v6.16b, v7.16b, #8
; CHECK-NEXT: zip2 v0.4s, v7.4s, v6.4s
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #12
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #12
; CHECK-NEXT: trn2 v0.4s, v7.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8
; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8
; CHECK-NEXT: ret
Expand Down Expand Up @@ -78,8 +78,8 @@ define <4 x i32> @test1503(<4 x i32> %a, <4 x i32> %b)
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v1.16b, #12
; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: trn1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8
; CHECK-NEXT: trn1 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
{
%r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 3>
Expand All @@ -89,9 +89,8 @@ define <4 x i32> @test1503(<4 x i32> %a, <4 x i32> %b)
define <4 x i32> @test4366(<4 x i32> %a, <4 x i32> %b)
; CHECK-LABEL: test4366:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v2.4s, v1.4s, v0.4s
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #4
; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret
{
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/sinksplat.ll
Expand Up @@ -336,9 +336,8 @@ define <4 x i32> @smull_nonsplat(<4 x i16> %x, <4 x i16> *%y) {
; CHECK-NEXT: fmov d1, d0
; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v2.4h, v1.h[3]
; CHECK-NEXT: ext v2.8b, v1.8b, v2.8b, #4
; CHECK-NEXT: ext v1.8b, v1.8b, v2.8b, #6
; CHECK-NEXT: trn2 v2.4h, v1.4h, v1.4h
; CHECK-NEXT: zip2 v1.4h, v2.4h, v1.4h
; CHECK-NEXT: .LBB10_1: // %l1
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d2, [x0]
Expand Down
12 changes: 6 additions & 6 deletions llvm/utils/PerfectShuffle/PerfectShuffle.cpp
Expand Up @@ -545,27 +545,27 @@ vext<2> the_vext2("vext2", OP_VEXT2);
vext<3> the_vext3("vext3", OP_VEXT3);

struct vuzpl : public Operator {
vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {}
vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 1) {}
} the_vuzpl;

struct vuzpr : public Operator {
vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {}
vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 1) {}
} the_vuzpr;

struct vzipl : public Operator {
vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {}
vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 1) {}
} the_vzipl;

struct vzipr : public Operator {
vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {}
vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 1) {}
} the_vzipr;

struct vtrnl : public Operator {
vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {}
vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 1) {}
} the_vtrnl;

struct vtrnr : public Operator {
vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {}
vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 1) {}
} the_vtrnr;

#endif

0 comments on commit 50af827

Please sign in to comment.