Skip to content

Commit

Permalink
AArch64: Improve immediate expansion [PR105928]
Browse files Browse the repository at this point in the history
Support immediate expansion of immediates which can be created from a MOV/MOVK
pair and a shifted ORR or BIC instruction.  Change
aarch64_split_dimode_const_store to apply if we save one instruction.

This reduces the number of 4-instruction immediates in SPECINT/FP by 5%.

Passes regress, OK for commit?

gcc/ChangeLog:
	PR target/105928
	* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
	Add support for immediates using shifted ORR/BIC.
	(aarch64_split_dimode_const_store): Apply if we save one instruction.
	* config/aarch64/aarch64.md (<LOGICAL:optab>_<SHIFT:optab><mode>3):
	Make pattern global.

gcc/testsuite/ChangeLog:
	PR target/105928
	* gcc.target/aarch64/pr105928.c: Add new test.
	* gcc.target/aarch64/vect-cse-codegen.c: Fix test.
  • Loading branch information
Wilco1 authored and ouuleilei-bot committed Sep 15, 2023
1 parent c2d62cd commit 2d47092
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 14 deletions.
43 changes: 32 additions & 11 deletions gcc/config/aarch64/aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5635,7 +5635,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
machine_mode mode)
{
int i;
unsigned HOST_WIDE_INT val, val2, mask;
unsigned HOST_WIDE_INT val, val2, val3, mask;
int one_match, zero_match;
int num_insns;

Expand Down Expand Up @@ -5717,6 +5717,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
}
return 3;
}

/* Try shifting and inserting the bottom 32-bits into the top bits. */
val2 = val & 0xffffffff;
val3 = 0xffffffff;
val3 = val2 | (val3 << 32);
for (i = 17; i < 48; i++)
if ((val2 | (val2 << i)) == val)
{
if (generate)
{
emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
emit_insn (gen_insv_immdi (dest, GEN_INT (16),
GEN_INT (val2 >> 16)));
emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
}
return 3;
}
else if ((val3 & ~(val3 << i)) == val)
{
if (generate)
{
emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
emit_insn (gen_insv_immdi (dest, GEN_INT (16),
GEN_INT (val2 >> 16)));
emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
dest));
}
return 3;
}
}

/* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
Expand Down Expand Up @@ -25184,8 +25213,6 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
rtx lo = gen_lowpart (SImode, src);
rtx hi = gen_highpart_mode (SImode, DImode, src);

bool size_p = optimize_function_for_size_p (cfun);

if (!rtx_equal_p (lo, hi))
return false;

Expand All @@ -25204,14 +25231,8 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
MOV w1, 49370
MOVK w1, 0x140, lsl 16
STP w1, w1, [x0]
So we want to perform this only when we save two instructions
or more. When optimizing for size, however, accept any code size
savings we can. */
if (size_p && orig_cost <= lo_cost)
return false;

if (!size_p
&& (orig_cost <= lo_cost + 1))
So we want to perform this when we save at least one instruction. */
if (orig_cost <= lo_cost)
return false;

rtx mem_lo = adjust_address (dst, SImode, 0);
Expand Down
2 changes: 1 addition & 1 deletion gcc/config/aarch64/aarch64.md
Original file line number Diff line number Diff line change
Expand Up @@ -4642,7 +4642,7 @@
[(set_attr "type" "logics_shift_imm")]
)

(define_insn "*<LOGICAL:optab>_<SHIFT:optab><mode>3"
(define_insn "<LOGICAL:optab>_<SHIFT:optab><mode>3"
[(set (match_operand:GPI 0 "register_operand" "=r")
(LOGICAL:GPI (SHIFT:GPI
(match_operand:GPI 1 "register_operand" "r")
Expand Down
43 changes: 43 additions & 0 deletions gcc/testsuite/gcc.target/aarch64/pr105928.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* { dg-do assemble } */
/* { dg-options "-O2 --save-temps" } */

/* Return a 64-bit constant equal to LO | (LO << 27), where LO is its low
   32 bits (0x10080400).  Declared long long rather than long so the
   constant is not truncated where long is only 32 bits wide.  */
long long f1 (void)
{
  return 0x80402010080400;
}

/* Return a 64-bit constant whose two 32-bit halves are both 0x12345678,
   i.e. LO | (LO << 32).  Declared long long rather than long so the
   constant is not truncated where long is only 32 bits wide.  */
long long f2 (void)
{
  return 0x1234567812345678;
}

/* Return a 64-bit constant equal to LO | (LO << 44), where LO is its low
   32 bits (0x12345678) and the bits shifted past bit 63 are discarded.
   Declared long long rather than long so the constant is not truncated
   where long is only 32 bits wide.  */
long long f3 (void)
{
  return 0x4567800012345678;
}

/* Return a 64-bit constant whose two 32-bit halves are both 0x3ecccccd,
   i.e. LO | (LO << 32).  Declared long long rather than long so the
   constant is not truncated where long is only 32 bits wide.  */
long long f4 (void)
{
  return 0x3ecccccd3ecccccd;
}

/* Return a 64-bit constant equal to V & ~(V << 27), where V is the low
   32 bits of the constant (0x8e38e38e) with all of the upper 32 bits set.
   Declared long long rather than long so the constant is not truncated
   where long is only 32 bits wide.  */
long long f5 (void)
{
  return 0x38e38e38e38e38e;
}

/* Return a 64-bit constant equal to V & ~(V << 25), where V is the low
   32 bits of the constant (0x45d1745d) with all of the upper 32 bits set.
   Declared long long rather than long so the constant is not truncated
   where long is only 32 bits wide.  */
long long f6 (void)
{
  return 0x1745d1745d1745d;
}

/* Store through P a 64-bit constant whose two 32-bit halves are equal
   (both 0x12345678).  The pointee is long long rather than long so the
   store stays 64 bits wide where long is only 32 bits wide.  */
void f7 (long long *p)
{
  *p = 0x1234567812345678;
}

/* { dg-final { scan-assembler-times {\tmovk\t} 7 } } */
/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */
/* { dg-final { scan-assembler-times {\tbic\t} 2 } } */
/* { dg-final { scan-assembler-times {\torr\t} 4 } } */
/* { dg-final { scan-assembler-times {\tstp\t} 1 } } */
3 changes: 1 addition & 2 deletions gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
** ushr v[0-9]+.16b, v[0-9]+.16b, 7
** mov x[0-9]+, 16512
** movk x[0-9]+, 0x1020, lsl 16
** movk x[0-9]+, 0x408, lsl 32
** movk x[0-9]+, 0x102, lsl 48
** orr x[0-9]+, x[0-9]+, x[0-9]+, lsl 28
** fmov d[0-9]+, x[0-9]+
** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
** dup v[0-9]+.2d, v[0-9]+.d\[0\]
Expand Down

0 comments on commit 2d47092

Please sign in to comment.