Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
8298244: AArch64: Optimize vector implementation of AddReduction for floating point

Reviewed-by: aph, xgong
  • Loading branch information
Fei Gao authored and Ningsheng Jian committed Dec 19, 2022
1 parent 7938f8c commit ba942c2
Show file tree
Hide file tree
Showing 5 changed files with 496 additions and 468 deletions.
51 changes: 31 additions & 20 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Expand Up @@ -134,6 +134,9 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
// It is not suitable for auto-vectorization because it does not add the elements
// in the same order as sequential code, and FP addition is non-associative.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
Expand Down Expand Up @@ -2876,23 +2879,30 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
%}

// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVF is only generated by Vector API.
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Selected only when SVE is not in use and the vector input (in(2)) has exactly 2 elements.
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
match(Set dst (AddReductionVF fsrc vsrc));
// $dst is written by the first instruction below, before $fsrc is read,
// so it is declared TEMP_DEF (presumably to stop it sharing a register with an input).
effect(TEMP_DEF dst);
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
ins_encode %{
// Pairwise add with the S size specifier: sum the two float lanes of $vsrc into $dst.
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
// Fold in the scalar input last, i.e. (v[0] + v[1]) + fsrc. This differs from the
// sequential order (fsrc + v[0]) + v[1], which matters because FP addition is
// not associative.
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_addF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0);
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_addF_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
__ fadds($dst$$FloatRegister, $fsrc$$FloatRegister, $vsrc$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
if (length_in_bytes == 16) {
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
}
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
Expand All @@ -2910,16 +2920,17 @@ instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
%}

// reduction addD

instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
// Floating-point addition is not associative, so the rule for AddReductionVD
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVD is only generated by Vector API.
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
predicate(UseSVE == 0);
match(Set dst (AddReductionVD dsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
effect(TEMP_DEF dst);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
ins_encode %{
__ faddd($dst$$FloatRegister, $dsrc$$FloatRegister, $vsrc$$FloatRegister);
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
Expand Down
51 changes: 31 additions & 20 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Expand Up @@ -124,6 +124,9 @@ source %{
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
// The vector implementation of Op_AddReductionVD/F is for the Vector API only.
// It is not suitable for auto-vectorization because it does not add the elements
// in the same order as sequential code, and FP addition is non-associative.
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
Expand Down Expand Up @@ -1808,23 +1811,30 @@ REDUCE_ADD_INT_NEON_SVE_PAIRWISE(I, iRegIorL2I)
REDUCE_ADD_INT_NEON_SVE_PAIRWISE(L, iRegL)

// reduction addF
// Floating-point addition is not associative, so the rules for AddReductionVF
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVF is only generated by Vector API.
instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
// Selected only when SVE is not in use and the vector input (in(2)) has exactly 2 elements.
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
match(Set dst (AddReductionVF fsrc vsrc));
// $dst is written by the first instruction below, before $fsrc is read,
// so it is declared TEMP_DEF (presumably to stop it sharing a register with an input).
effect(TEMP_DEF dst);
format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
ins_encode %{
// Pairwise add with the S size specifier: sum the two float lanes of $vsrc into $dst.
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S);
// Fold in the scalar input last, i.e. (v[0] + v[1]) + fsrc. This differs from the
// sequential order (fsrc + v[0]) + v[1], which matters because FP addition is
// not associative.
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}

instruct reduce_addF_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0);
instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
match(Set dst (AddReductionVF fsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_addF_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
ins_encode %{
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
__ fadds($dst$$FloatRegister, $fsrc$$FloatRegister, $vsrc$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 1);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
if (length_in_bytes == 16) {
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 2);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
__ ins($tmp$$FloatRegister, __ S, $vsrc$$FloatRegister, 0, 3);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
}
__ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister);
__ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S);
__ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
Expand All @@ -1847,16 +1857,17 @@ dnl
REDUCE_ADD_FP_SVE(F, S)

// reduction addD

instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
// Floating-point addition is not associative, so the rule for AddReductionVD
// on NEON can't be used to auto-vectorize floating-point reduce-add.
// Currently, on NEON, AddReductionVD is only generated by Vector API.
instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
predicate(UseSVE == 0);
match(Set dst (AddReductionVD dsrc vsrc));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
effect(TEMP_DEF dst);
format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
ins_encode %{
__ faddd($dst$$FloatRegister, $dsrc$$FloatRegister, $vsrc$$FloatRegister);
__ ins($tmp$$FloatRegister, __ D, $vsrc$$FloatRegister, 0, 1);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $tmp$$FloatRegister);
__ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D);
__ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister);
%}
ins_pipe(pipe_slow);
%}
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/aarch64/assembler_aarch64.hpp
Expand Up @@ -2716,6 +2716,7 @@ template<typename R, typename... Rx>
INSN(fabd, 1, 1, 0b110101);
INSN(fadd, 0, 0, 0b110101);
INSN(fdiv, 1, 0, 0b111111);
INSN(faddp, 1, 0, 0b110101);
INSN(fmul, 1, 0, 0b110111);
INSN(fsub, 0, 1, 0b110101);
INSN(fmla, 0, 0, 0b110011);
Expand Down
2 changes: 2 additions & 0 deletions test/hotspot/gtest/aarch64/aarch64-asmtest.py
Expand Up @@ -1564,6 +1564,8 @@ def generate(kind, names):
["mulv", "mul", "2S"], ["mulv", "mul", "4S"],
["fabd", "fabd", "2S"], ["fabd", "fabd", "4S"],
["fabd", "fabd", "2D"],
["faddp", "faddp", "2S"], ["faddp", "faddp", "4S"],
["faddp", "faddp", "2D"],
["fmul", "fmul", "2S"], ["fmul", "fmul", "4S"],
["fmul", "fmul", "2D"],
["mlav", "mla", "4H"], ["mlav", "mla", "8H"],
Expand Down

1 comment on commit ba942c2

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.