-
Notifications
You must be signed in to change notification settings - Fork 6.2k
8343689: AArch64: Optimize MulReduction implementation #23181
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
0a62dc3
c9dcc45
ff3a7aa
3fc989b
8c90035
9b4243a
68bbbe2
025d516
df09ab6
ebad6dd
609f78e
d35f108
4593a5d
1d2b981
5b06b63
91cbacc
4aed1f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -117,6 +117,17 @@ source %{ | |||||||
} | ||||||||
|
||||||||
bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { | ||||||||
// Do not auto-vectorize these FP operations, neither NEON nor SVE/SVE2 supports them directly: | ||||||||
// 1. The non_strict_order SVE implementation for 256-bit wide vectors does recursive folding | ||||||||
// and doesn't conform to the JLS, Section Evaluation Order. | ||||||||
// 2. A strictly ordered SVE implementation for 256-bit wide vectors isn't currently | ||||||||
// profitable performance-wise. | ||||||||
// 3. The strictly ordered NEON implementation for 64-bit and 128-bit wide vectors isn't | ||||||||
// profitable performance-wise. | ||||||||
if (opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { | ||||||||
return false; | ||||||||
} | ||||||||
|
||||||||
if (UseSVE == 0) { | ||||||||
// These operations are not profitable to be vectorized on NEON, because no direct | ||||||||
// NEON instructions support them. But the match rule support for them is profitable for | ||||||||
|
@@ -129,7 +140,6 @@ source %{ | |||||||
// They are not suitable for auto-vectorization because the result would not conform | ||||||||
// to the JLS, Section Evaluation Order. | ||||||||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || | ||||||||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || | ||||||||
opcode == Op_MulVL) { | ||||||||
return false; | ||||||||
} | ||||||||
|
@@ -195,9 +205,9 @@ source %{ | |||||||
case Op_MulReductionVF: | ||||||||
case Op_MulReductionVI: | ||||||||
case Op_MulReductionVL: | ||||||||
// No vector multiply reduction instructions, but we do | ||||||||
// emit scalar instructions for 64/128-bit vectors. | ||||||||
if (length_in_bytes != 8 && length_in_bytes != 16) { | ||||||||
// No vector multiply reduction instructions, but we do emit ASIMD instructions for | ||||||||
// 64/128-bit vectors. For 256-bit vectors it's a combination of SVE and ASIMD instructions. | ||||||||
if (length_in_bytes < 8 || length_in_bytes > 32) { | ||||||||
return false; | ||||||||
} | ||||||||
break; | ||||||||
|
@@ -2109,56 +2119,122 @@ REDUCE_ADD_FP_PREDICATE(D, D) | |||||||
|
||||||||
// ------------------------------ Vector reduction mul ------------------------- | ||||||||
|
||||||||
instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc, | ||||||||
vReg tmp1, vReg tmp2) %{ | ||||||||
instruct reduce_mulI_le128b(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc, | ||||||||
vReg tmp1, vReg tmp2) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 8 || | ||||||||
Matcher::vector_length_in_bytes(n->in(2)) == 16); | ||||||||
match(Set dst (MulReductionVI isrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); | ||||||||
format %{ "reduce_mulI $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %} | ||||||||
format %{ "reduce_mulI_le128b $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %} | ||||||||
ins_encode %{ | ||||||||
BasicType bt = Matcher::vector_element_basic_type(this, $vsrc); | ||||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); | ||||||||
__ reduce_mul_integral_le128b($dst$$Register, bt, $isrc$$Register, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, | ||||||||
$tmp1$$FloatRegister, $tmp2$$FloatRegister); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
instruct reduce_mulI_256b(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc, | ||||||||
vReg tmp1, vReg tmp2, vReg tmp3) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32); | ||||||||
match(Set dst (MulReductionVI isrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3); | ||||||||
format %{ "reduce_mulI_256b $dst, $isrc, $vsrc\t# vector (256 bits). KILL $tmp1, $tmp2, $tmp3" %} | ||||||||
ins_encode %{ | ||||||||
assert(UseSVE > 0, "must be sve"); | ||||||||
BasicType bt = Matcher::vector_element_basic_type(this, $vsrc); | ||||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); | ||||||||
__ neon_reduce_mul_integral($dst$$Register, bt, $isrc$$Register, | ||||||||
assert(length_in_bytes == MaxVectorSize, "invalid vector length"); | ||||||||
__ reduce_mul_integral_256b($dst$$Register, bt, $isrc$$Register, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, | ||||||||
$tmp1$$FloatRegister, $tmp2$$FloatRegister); | ||||||||
$tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{ | ||||||||
instruct reduce_mulL_128b(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16); | ||||||||
match(Set dst (MulReductionVL isrc vsrc)); | ||||||||
effect(TEMP_DEF dst); | ||||||||
format %{ "reduce_mulL $dst, $isrc, $vsrc\t# 2L" %} | ||||||||
format %{ "reduce_mulL_128b $dst, $isrc, $vsrc\t# 2L" %} | ||||||||
ins_encode %{ | ||||||||
__ neon_reduce_mul_integral($dst$$Register, T_LONG, $isrc$$Register, | ||||||||
$vsrc$$FloatRegister, 16, fnoreg, fnoreg); | ||||||||
__ reduce_mul_integral_le128b($dst$$Register, T_LONG, $isrc$$Register, $vsrc$$FloatRegister, 16, | ||||||||
fnoreg, fnoreg); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ | ||||||||
// Match rule for MulReductionVL (long multiply reduction) selected when the vector
// input n->in(2) is exactly 32 bytes long. dst is declared TEMP_DEF and one vector
// register (tmp1) is reserved as a TEMP; the format string reports it as killed.
instruct reduce_mulL_256b(iRegLNoSp dst, iRegL isrc, vReg vsrc, vReg tmp1) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32); | ||||||||
match(Set dst (MulReductionVL isrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp1); | ||||||||
format %{ "reduce_mulL_256b $dst, $isrc, $vsrc\t# 4L. KILL $tmp1" %} | ||||||||
ins_encode %{ | ||||||||
// This rule is only expected to be reached with SVE enabled.
assert(UseSVE > 0, "must be sve"); | ||||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); | ||||||||
// The vector being reduced must span the full configured MaxVectorSize.
assert(length_in_bytes == MaxVectorSize, "invalid vector length"); | ||||||||
// Delegate to the macro-assembler helper with T_LONG elements. Only tmp1 is supplied;
// the remaining two temp arguments are passed as fnoreg.
__ reduce_mul_integral_256b($dst$$Register, T_LONG, $isrc$$Register, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, | ||||||||
$tmp1$$FloatRegister, fnoreg, fnoreg); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
instruct reduce_mulF_le128b(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16); | ||||||||
match(Set dst (MulReductionVF fsrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp); | ||||||||
format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %} | ||||||||
format %{ "reduce_mulF_le128b $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %} | ||||||||
ins_encode %{ | ||||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); | ||||||||
__ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister); | ||||||||
__ reduce_mul_fp_le128b($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
instruct reduce_non_strict_order_mulF_256b(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp1, vReg tmp2) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32 && !n->as_Reduction()->requires_strict_order()); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
match(Set dst (MulReductionVF fsrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); | ||||||||
format %{ "reduce_non_strict_order_mulF_256b $dst, $fsrc, $vsrc\t# 8F. KILL $tmp1, $tmp2" %} | ||||||||
ins_encode %{ | ||||||||
assert(UseSVE > 0, "must be sve"); | ||||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); | ||||||||
assert(length_in_bytes == MaxVectorSize, "invalid vector length"); | ||||||||
__ reduce_non_strict_order_mul_fp_256b($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, $tmp1$$FloatRegister, | ||||||||
$tmp2$$FloatRegister); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{ | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please consider that |
||||||||
instruct reduce_mulD_128b(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16); | ||||||||
match(Set dst (MulReductionVD dsrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp); | ||||||||
format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %} | ||||||||
format %{ "reduce_mulD_128b $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %} | ||||||||
ins_encode %{ | ||||||||
__ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister, | ||||||||
$vsrc$$FloatRegister, 16, $tmp$$FloatRegister); | ||||||||
__ reduce_mul_fp_le128b($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister, | ||||||||
$vsrc$$FloatRegister, 16, $tmp$$FloatRegister); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
||||||||
// Match rule for MulReductionVD (double multiply reduction) selected when the vector
// input n->in(2) is exactly 32 bytes long AND the reduction node does not require
// strict ordering. dst is declared TEMP_DEF and two vector registers (tmp1, tmp2) are
// reserved as TEMPs; the format string reports both as killed.
instruct reduce_non_strict_order_mulD_256b(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp1, vReg tmp2) %{ | ||||||||
predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32 && !n->as_Reduction()->requires_strict_order()); | ||||||||
match(Set dst (MulReductionVD dsrc vsrc)); | ||||||||
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); | ||||||||
format %{ "reduce_non_strict_order_mulD_256b $dst, $dsrc, $vsrc\t# 4D. KILL $tmp1, $tmp2" %} | ||||||||
ins_encode %{ | ||||||||
// This rule is only expected to be reached with SVE enabled.
assert(UseSVE > 0, "must be sve"); | ||||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc); | ||||||||
// The vector being reduced must span the full configured MaxVectorSize.
assert(length_in_bytes == MaxVectorSize, "invalid vector length"); | ||||||||
// Delegate to the non-strict-order FP helper with T_DOUBLE elements and both temps.
__ reduce_non_strict_order_mul_fp_256b($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister, | ||||||||
$vsrc$$FloatRegister, length_in_bytes, $tmp1$$FloatRegister, | ||||||||
$tmp2$$FloatRegister); | ||||||||
%} | ||||||||
ins_pipe(pipe_slow); | ||||||||
%} | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4064,6 +4064,15 @@ template<typename R, typename... Rx> | |
INSN(sve_brkb, 0b10); // Break before first true condition | ||
#undef INSN | ||
|
||
// SVE Integer Misc - Unpredicated | ||
|
||
// SVE constructive prefix (unpredicated) | ||
// Emits a single instruction built from constant fields covering bits 31..10,
// followed by the Zn register field placed at bit 5 and the Zd register field at bit 0.
// NOTE(review): assumes f(value, msb, lsb) writes a fixed bit-field and rf(reg, lsb)
// writes a register number; confirm the constants against the MOVPRFX (unpredicated)
// entry in the Arm ARM.
void sve_movprfx(FloatRegister Zd, FloatRegister Zn) { | ||
starti; | ||
f(0b00000100, 31, 24), f(0b00, 23, 22), f(0b1, 21), f(0b00000, 20, 16); | ||
f(0b101111, 15, 10), rf(Zn, 5), rf(Zd, 0); | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This pattern should be in a section SVE Integer Reduction, C4.1.37. I'm not sure if any other instructions in that group are defined yet, but if not please start the section. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, the unpredicated version should be in the SVE Integer Misc - Unpredicated section. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you asking to move it to another existing section in the file or create a new one? If it's the former, could you point me to the section in the file - I can see neither If it's something else completely, please elaborate. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please try to organize things the same way as the Decode section of the ARM. Insert a new section called SVE Integer Misc - Unpredicated after SVE bitwise shift by immediate (predicated) and put this pattern there. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Do you refer to C4: A64 Instruction Set Encoding?
I assume you might have misinterpreted predicated SVE bitwise shift for unpredicated. In the C4: A64 Instruction Set Encoding, C4.1.41 SVE Integer Misc - Unpredicated follows C4.1.40 SVE Bitwise Shift - Unpredicated which is not implemented by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's possible. The point is to make sure that any new instruction is in a section corresponding to its section in the Decoding tables. Please make your best guess as to where that should be, and we'll discuss it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To (at least partially) conform to the ordering in C4: A64 Instruction Set Encoding, it should be placed either right after SVE stack frame adjustment or right before SVE element count as described above. The patch does the latter. I've started the section, please check 4aed1f6 and resolve the thread if you find it suitable. |
||
// Element count and increment scalar (SVE) | ||
#define INSN(NAME, TYPE) \ | ||
void NAME(Register Xdn, unsigned imm4 = 1, int pattern = 0b11111) { \ | ||
|
Uh oh!
There was an error while loading. Please reload this page.