Skip to content

Commit

Permalink
tree-optimization/94864 - vector insert of vector extract simplification
Browse files Browse the repository at this point in the history
The PRs ask for optimizing of

  _1 = BIT_FIELD_REF <b_3(D), 64, 64>;
  result_4 = BIT_INSERT_EXPR <a_2(D), _1, 64>;

to a vector permutation.  The following implements this as
match.pd pattern, improving code generation on x86_64.

On the RTL level we face the issue that backend patterns inconsistently
use vec_merge and vec_select of vec_concat to represent permutes.

I think using a (supported) permute is almost always better
than an extract plus insert, maybe excluding the case we extract
element zero and that's aliased to a register that can be used
directly for insertion (not sure how to query that).

The patch FAILs one case in gcc.target/i386/avx512fp16-vmovsh-1a.c
where we now expand from

 __A_28 = VEC_PERM_EXPR <x2.8_9, x1.9_10, { 0, 9, 10, 11, 12, 13, 14, 15 }>;

instead of

 _28 = BIT_FIELD_REF <x2.8_9, 16, 0>;
 __A_29 = BIT_INSERT_EXPR <x1.9_10, _28, 0>;

producing a vpblendw instruction instead of the expected vmovsh.  That's
either a missed vec_perm_const expansion optimization or even better,
an improvement - Zen4 for example has 4 ports to execute vpblendw
but only 3 for executing vmovsh and both instructions have the same size.

The patch XFAILs the sub-testcase - is that OK or should I update
the expected instruction to a vpblend?

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Thanks,
Richard.

	PR tree-optimization/94864
	PR tree-optimization/94865
	PR tree-optimization/93080
	* match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern
	for vector insertion from vector extraction.

	* gcc.target/i386/pr94864.c: New testcase.
	* gcc.target/i386/pr94865.c: Likewise.
	* gcc.target/i386/avx512fp16-vmovsh-1a.c: XFAIL.
	* gcc.dg/tree-ssa/forwprop-40.c: Likewise.
	* gcc.dg/tree-ssa/forwprop-41.c: Likewise.
  • Loading branch information
rguenth authored and ouuleilei-bot committed Aug 22, 2023
1 parent c2d62cd commit 731fd04
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 1 deletion.
25 changes: 25 additions & 0 deletions gcc/match.pd
Original file line number Diff line number Diff line change
Expand Up @@ -7653,6 +7653,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
wi::to_wide (@ipos) + isize))
(BIT_FIELD_REF @0 @rsize @rpos)))))

/* Simplify vector inserts of other vector extracts to a permute. */
(simplify
(bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
(if (VECTOR_TYPE_P (type)
&& types_match (@0, @1)
&& types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
&& TYPE_VECTOR_SUBPARTS (type).is_constant ())
(with
{
unsigned HOST_WIDE_INT elsz
= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
vec_perm_builder builder;
builder.new_vector (nunits, nunits, 1);
for (unsigned i = 0; i < nunits; ++i)
builder.quick_push (known_eq (ielt, i) ? nunits + relt : i);
vec_perm_indices sel (builder, 2, nunits);
}
(if (!VECTOR_MODE_P (TYPE_MODE (type))
|| can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
(vec_perm @0 @1 { vec_perm_indices_to_tree
(build_vector_type (ssizetype, nunits), sel); })))))

(if (canonicalize_math_after_vectorization_p ())
(for fmas (FMA)
(simplify
Expand Down
14 changes: 14 additions & 0 deletions gcc/testsuite/gcc.dg/tree-ssa/forwprop-40.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-optimized -Wno-psabi -w" } */

#define vector __attribute__((__vector_size__(16) ))

vector int g(vector int a)
{
int b = a[0];
a[0] = b;
return a;
}

/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 0 "optimized" } } */
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
16 changes: 16 additions & 0 deletions gcc/testsuite/gcc.dg/tree-ssa/forwprop-41.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-optimized -Wno-psabi -w" } */

#define vector __attribute__((__vector_size__(16) ))

vector int g(vector int a, int c)
{
int b = a[2];
a[2] = b;
a[1] = c;
return a;
}

/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 1 "optimized" } } */
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 0 "optimized" } } */
/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "optimized" } } */
2 changes: 1 addition & 1 deletion gcc/testsuite/gcc.target/i386/avx512fp16-vmovsh-1a.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+%xmm\[0-9\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+\[^\n\r\]*%\[er\]\[ad]x+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^z\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
/* { dg-final { scan-assembler-times "vmovsh\[ \\t\]+%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */

Expand Down
13 changes: 13 additions & 0 deletions gcc/testsuite/gcc.target/i386/pr94864.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-O2 -msse2 -mno-avx" } */

typedef double v2df __attribute__((vector_size(16)));

v2df move_sd(v2df a, v2df b)
{
v2df result = a;
result[0] = b[1];
return result;
}

/* { dg-final { scan-assembler "unpckhpd\[\\t \]%xmm0, %xmm1" } } */
13 changes: 13 additions & 0 deletions gcc/testsuite/gcc.target/i386/pr94865.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/* { dg-do compile } */
/* { dg-options "-O2 -msse2 -mno-avx" } */

typedef double v2df __attribute__((vector_size(16)));

v2df move_sd(v2df a, v2df b)
{
v2df result = a;
result[1] = b[1];
return result;
}

/* { dg-final { scan-assembler "shufpd\[\\t \]*.2, %xmm1, %xmm0" } } */

0 comments on commit 731fd04

Please sign in to comment.