aarch64: Restore vectorisation of vld1 inputs [PR109072]
Before GCC 12, we would vectorize:

  int32_t arr[] = { x, x, x, x };

at -O3.  Vectorizing the store on its own is often a loss, particularly
for integers, so g:4963079769c99c4073adfd799885410ad484cbbe suppressed it.
This was necessary to fix regressions from enabling vectorisation at -O2.

However, the vectorisation is important if the code subsequently loads
from the array using vld1:

  return vld1q_s32 (arr);

This approach of initialising an array and loading from it is the
recommended endian-agnostic way of constructing an ACLE vector.
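
For concreteness, a complete function using this idiom looks something like
the following (a minimal sketch; the function name is illustrative and not
taken from the patch or its tests):

  #include <arm_neon.h>

  /* Broadcast X to all four lanes using the endian-agnostic
     array-plus-vld1 idiom described above.  */
  int32x4_t
  dup_s32 (int32_t x)
  {
    int32_t arr[] = { x, x, x, x };
    return vld1q_s32 (arr);
  }

With the array initialisation vectorised, the store to arr and the vld1 load
back from it can (on the patch's optimistic assumption) be removed later.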

As discussed in the PR notes, the general fix would be to fold the
store and load-back to a constructor (preferably before vectorisation).
But that's clearly not stage 4 material.

This patch instead delays folding vld1 until after inlining and
records which decls a vld1 loads from.  It then treats vector
stores to those decls as free, on the optimistic assumption that
they will be removed later.  The patch also brute-forces
vectorization of plain constructor+store sequences, since some
of the CPU costs make that (dubiously) expensive even when the
store is discounted.

Delaying folding showed that we were failing to update the vops.
The patch fixes that too.

Thanks to Tamar for discussion & help with testing.

gcc/
	PR target/109072
	* config/aarch64/aarch64-protos.h (aarch64_vector_load_decl): Declare.
	* config/aarch64/aarch64.h (machine_function::vector_load_decls): New
	variable.
	* config/aarch64/aarch64-builtins.cc (aarch64_record_vector_load_arg):
	New function.
	(aarch64_general_gimple_fold_builtin): Delay folding of vld1 until
	after inlining.  Record which decls are loaded from.  Fix handling
	of vops for loads and stores.
	* config/aarch64/aarch64.cc (aarch64_vector_load_decl): New function.
	(aarch64_accesses_vector_load_decl_p): Likewise.
	(aarch64_vector_costs::m_stores_to_vector_load_decl): New member
	variable.
	(aarch64_vector_costs::add_stmt_cost): If the function has a vld1
	that loads from a decl, treat vector stores to those decls as
	zero cost.
	(aarch64_vector_costs::finish_cost): ...and in that case,
	if the vector code does nothing more than a store, give the
	prologue a zero cost as well.

gcc/testsuite/
	PR target/109072
	* gcc.target/aarch64/pr109072_1.c: New test.
	* gcc.target/aarch64/pr109072_2.c: Likewise.
rsandifo-arm committed Mar 28, 2023
1 parent 75cda3b commit fcb4115
Showing 6 changed files with 435 additions and 4 deletions.
22 changes: 22 additions & 0 deletions gcc/config/aarch64/aarch64-builtins.cc
@@ -2994,6 +2994,19 @@ get_mem_type_for_load_store (unsigned int fcode)
     }
 }
 
+/* We've seen a vector load from address ADDR.  Record it in
+   vector_load_decls, if appropriate.  */
+static void
+aarch64_record_vector_load_arg (tree addr)
+{
+  tree decl = aarch64_vector_load_decl (addr);
+  if (!decl)
+    return;
+  if (!cfun->machine->vector_load_decls)
+    cfun->machine->vector_load_decls = hash_set<tree>::create_ggc (31);
+  cfun->machine->vector_load_decls->add (decl);
+}
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
@@ -3051,6 +3064,11 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
       BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
       BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
       BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
+        /* Punt until after inlining, so that we stand more chance of
+           recording something meaningful in vector_load_decls.  */
+        if (!cfun->after_inlining)
+          break;
+        aarch64_record_vector_load_arg (args[0]);
         if (!BYTES_BIG_ENDIAN)
           {
             enum aarch64_simd_type mem_type
@@ -3069,6 +3087,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
                                      fold_build2 (MEM_REF,
                                                   access_type,
                                                   args[0], zero));
+            gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+            gimple_set_vdef (new_stmt, gimple_vdef (stmt));
           }
         break;
 
@@ -3092,6 +3112,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
               = gimple_build_assign (fold_build2 (MEM_REF, access_type,
                                                   args[0], zero),
                                      args[1]);
+            gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+            gimple_set_vdef (new_stmt, gimple_vdef (stmt));
           }
         break;
 
1 change: 1 addition & 0 deletions gcc/config/aarch64/aarch64-protos.h
@@ -761,6 +761,7 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem (rtx *);
 bool aarch64_expand_setmem (rtx *);
70 changes: 66 additions & 4 deletions gcc/config/aarch64/aarch64.cc
@@ -15661,6 +15661,33 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
 
 /* Vectorizer cost model target hooks.  */
 
+/* If a vld1 from address ADDR should be recorded in vector_load_decls,
+   return the decl that should be recorded.  Return null otherwise.  */
+tree
+aarch64_vector_load_decl (tree addr)
+{
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return NULL_TREE;
+  tree base = get_base_address (TREE_OPERAND (addr, 0));
+  if (TREE_CODE (base) != VAR_DECL)
+    return NULL_TREE;
+  return base;
+}
+
+/* Return true if STMT_INFO accesses a decl that is known to be the
+   argument to a vld1 in the same function.  */
+static bool
+aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
+{
+  if (!cfun->machine->vector_load_decls)
+    return false;
+  auto dr = STMT_VINFO_DATA_REF (stmt_info);
+  if (!dr)
+    return false;
+  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
+  return decl && cfun->machine->vector_load_decls->contains (decl);
+}
+
 /* Information about how the CPU would issue the scalar, Advanced SIMD
    or SVE version of a vector loop, using the scheme defined by the
    aarch64_base_vec_issue_info hierarchy of structures.  */
@@ -15891,6 +15918,20 @@ class aarch64_vector_costs : public vector_costs
      supported by Advanced SIMD and SVE2.  */
   bool m_has_avg = false;
 
+  /* True if the vector body contains a store to a decl and if the
+     function is known to have a vld1 from the same decl.
+
+     In the Advanced SIMD ACLE, the recommended endian-agnostic way of
+     initializing a vector is:
+
+       float f[4] = { elts };
+       float32x4_t x = vld1q_f32(f);
+
+     We should strongly prefer vectorization of the initialization of f,
+     so that the store to f and the load back can be optimized away,
+     leaving a vectorization of { elts }.  */
+  bool m_stores_to_vector_load_decl = false;
+
   /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
        SIMD code.
@@ -16907,6 +16948,18 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
             }
         }
     }
+
+  /* If the statement stores to a decl that is known to be the argument
+     to a vld1 in the same function, ignore the store for costing purposes.
+     See the comment above m_stores_to_vector_load_decl for more details.  */
+  if (stmt_info
+      && (kind == vector_store || kind == unaligned_store)
+      && aarch64_accesses_vector_load_decl_p (stmt_info))
+    {
+      stmt_cost = 0;
+      m_stores_to_vector_load_decl = true;
+    }
+
   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
 }
 
@@ -17196,12 +17249,21 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
      the scalar code in the event of a tie, since there is more chance
-     of scalar code being optimized with surrounding operations.  */
+     of scalar code being optimized with surrounding operations.
+
+     In addition, if the vector body is a simple store to a decl that
+     is elsewhere loaded using vld1, strongly prefer the vector form,
+     to the extent of giving the prologue a zero cost.  See the comment
+     above m_stores_to_vector_load_decl for details.  */
   if (!loop_vinfo
       && scalar_costs
-      && m_stp_sequence_cost != ~0U
-      && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
-    m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+      && m_stp_sequence_cost != ~0U)
+    {
+      if (m_stores_to_vector_load_decl)
+        m_costs[vect_prologue] = 0;
+      else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
+        m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+    }
 
   vector_costs::finish_cost (scalar_costs);
 }
5 changes: 5 additions & 0 deletions gcc/config/aarch64/aarch64.h
@@ -860,6 +860,7 @@ struct GTY (()) aarch64_frame
   bool is_scs_enabled;
 };
 
+#ifdef hash_set_h
 typedef struct GTY (()) machine_function
 {
   struct aarch64_frame frame;
@@ -868,8 +869,12 @@ typedef struct GTY (()) machine_function
   /* One entry for each general purpose register.  */
   rtx call_via[SP_REGNUM];
   bool label_is_assembled;
+  /* A set of all decls that have been passed to a vld1 intrinsic in the
+     current function.  This is used to help guide the vector cost model.  */
+  hash_set<tree> *vector_load_decls;
 } machine_function;
 #endif
+#endif
 
 /* Which ABI to use.  */
 enum aarch64_abi_type
(The diffs for the two new tests, gcc/testsuite/gcc.target/aarch64/pr109072_1.c and pr109072_2.c, are not rendered here.)
