-
Notifications
You must be signed in to change notification settings - Fork 6.1k
8302652: [SuperWord] Reduction should happen after loop, when possible #13056
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7f9641e
7b551b6
83988de
e108ae5
9391f99
3a9ba97
ed9e788
9b01aea
3130bf3
02546bd
a9dbffe
cc9e7e8
5a51ac3
56990bd
72fa58e
31d977c
0a72f4c
9291fb3
e1af096
e3d99c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -41,6 +41,7 @@ | |||||||
#include "opto/rootnode.hpp" | ||||||||
#include "opto/subnode.hpp" | ||||||||
#include "opto/subtypenode.hpp" | ||||||||
#include "opto/vectornode.hpp" | ||||||||
#include "utilities/macros.hpp" | ||||||||
|
||||||||
//============================================================================= | ||||||||
|
@@ -4120,3 +4121,188 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old | |||||||
|
||||||||
return true; | ||||||||
} | ||||||||
|
||||||||
// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// CountedLoop     init
//          |        |
//          +------+ | +-----------------------+
//                 | | |                       |
//                PhiNode (s)                  |
//                  |                          |
//                  |          Vector          |
//                  |            |             |
//               UnorderedReduction (first_ur) |
//                  |                          |
//                 ...         Vector          |
//                  |            |             |
//               UnorderedReduction (last_ur)  |
//                       |                     |
//                       +---------------------+
//
// We patch the graph to look like this:
//
// CountedLoop   identity_vector
//         |         |
//         +-------+ | +---------------+
//                 | | |               |
//                PhiNode (v)          |
//                   |                 |
//                   |         Vector  |
//                   |           |     |
//                 VectorAccumulator   |
//                   |                 |
//                  ...        Vector  |
//                   |           |     |
//      init       VectorAccumulator   |
//        |          |     |           |
//     UnorderedReduction  +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition).
//
// The transformation is attempted independently for every Phi of the loop
// head. A Phi whose reduction chain does not match the pattern above is
// left untouched.
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
  assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");

  // Find all Phi nodes with UnorderedReduction on backedge.
  CountedLoopNode* cl = loop->_head->as_CountedLoop();
  for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
    Node* phi = cl->fast_out(j);
    // We have a phi with a single use, and a UnorderedReduction on the backedge.
    if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
      continue;
    }

    UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();

    // Determine types
    const TypeVect* vec_t = last_ur->vect_type();
    uint vector_length    = vec_t->length();
    BasicType bt          = vec_t->element_basic_type();
    const Type* bt_t      = Type::get_const_basic_type(bt);

    // Convert opcode from vector-reduction -> scalar -> normal-vector-op
    const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
    const int vopc = VectorNode::opcode(sopc, bt);
    if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
      DEBUG_ONLY( last_ur->dump(); )
      assert(false, "do not have normal vector op for this reduction");
      continue; // not implemented -> fails
    }

    // Traverse up the chain of UnorderedReductions, checking that it loops back to
    // the phi. Check that all UnorderedReductions only have a single use, except for
    // the last (last_ur), which only has phi as a use in the loop, and all other uses
    // are outside the loop.
    // On success, current is set to nullptr and first_ur to the reduction that
    // consumes the phi. On failure, current stays non-null.
    UnorderedReductionNode* current = last_ur;
    UnorderedReductionNode* first_ur = nullptr;
    while (true) {
      assert(current->is_UnorderedReduction(), "sanity");

      // Expect no ctrl and a vector_input from within the loop.
      Node* ctrl = current->in(0);
      Node* vector_input = current->in(2);
      if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
        DEBUG_ONLY( current->dump(1); )
        assert(false, "reduction has ctrl or bad vector_input");
        break; // Chain traversal fails.
      }

      // Every vector_accumulator below is created with vec_t (the vector type of
      // last_ur). A reduction in the chain with a different vector type cannot be
      // replaced that way, so do not transform such a chain.
      if (current->vect_type() != vec_t) {
        break; // Chain traversal fails.
      }

      // Expect single use of UnorderedReduction, except for last_ur.
      if (current == last_ur) {
        // Expect all uses to be outside the loop, except phi.
        // A use of last_ur inside the loop is not expected (hence the assert in
        // debug builds), but bailing out is the right reaction: after the
        // transformation such a use would no longer see the scalar reduction.
        // The inner loop can only be left with its own break, so the result is
        // recorded in a flag and the chain traversal is aborted afterwards.
        bool has_use_in_loop = false;
        for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
          Node* use = current->fast_out(k);
          if (use != phi && ctrl_or_self(use) == cl) {
            DEBUG_ONLY( current->dump(-1); )
            assert(false, "reduction has use inside loop");
            has_use_in_loop = true;
            break;
          }
        }
        if (has_use_in_loop) {
          break; // Chain traversal fails.
        }
      } else {
        if (current->outcnt() != 1) {
          break; // Chain traversal fails.
        }
      }

      // Expect another UnorderedReduction or phi as the scalar input.
      Node* scalar_input = current->in(1);
      if (scalar_input->is_UnorderedReduction() &&
          scalar_input->Opcode() == current->Opcode()) {
        // Move up the UnorderedReduction chain.
        current = scalar_input->as_UnorderedReduction();
      } else if (scalar_input == phi) {
        // Chain terminates at phi.
        first_ur = current;
        current = nullptr;
        break; // Success.
      } else {
        DEBUG_ONLY( current->dump(1); )
        assert(false, "scalar_input is neither phi nor a matching reduction");
        break; // Chain traversal fails.
      }
    }
    if (current != nullptr) {
      // Chain traversal was not successful.
      continue;
    }
    assert(first_ur != nullptr, "must have successfully terminated chain traversal");

    // Create the identity vector that becomes the new init value of the phi.
    Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
    set_ctrl(identity_scalar, C->root());
    VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
    register_new_node(identity_vector, C->root());
    assert(vec_t == identity_vector->vect_type(), "matching vector type");
    VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");

    // Turn the scalar phi into a vector phi.
    _igvn.rehash_node_delayed(phi);
    Node* init = phi->in(1); // Remember init before replacing it.
    phi->set_req_X(1, identity_vector, &_igvn);
    phi->as_Type()->set_type(vec_t);
    _igvn.set_type(phi, vec_t);

    // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
    current = first_ur;
    while (true) {
      // Create vector_accumulator to replace current.
      Node* last_vector_accumulator = current->in(1);
      Node* vector_input            = current->in(2);
      VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
      register_new_node(vector_accumulator, cl);
      _igvn.replace_node(current, vector_accumulator);
      VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
      if (current == last_ur) {
        break;
      }
      // The traversal above verified that every reduction except last_ur has a
      // single use, which is the next reduction of the chain.
      current = vector_accumulator->unique_out()->as_UnorderedReduction();
    }

    // Create post-loop reduction. It also reduces the original init value into the result.
    Node* last_accumulator = phi->in(2);
    Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);

    // Take over uses of last_accumulator that are not in the loop.
    for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
      Node* use = last_accumulator->out(i);
      if (use != phi && use != post_loop_reduction) {
        assert(ctrl_or_self(use) != cl, "use must be outside loop");
        use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
        --i; // The out edge was removed, visit the same index again.
      }
    }
    register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
    VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");

    assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
    assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
    assert(phi->outcnt() == 1, "accumulator is the only use of phi");
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Other changes look good to me. Can you rename `VectorNode::scalar_opcode` to `ReductionNode::scalar_opcode`,
and also move the vector opcode cases out into a separate vector-to-scalar mapping routine if needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it not better to have `VectorNode::scalar_opcode`? It is more general - maybe it is useful in the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not a blocker, but we intend to get a scalar opcode for a ReductionNode, and we have different factory methods for Vector/Reduction nodes. You can keep it for now.
Best Regards,
Jatin
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jatin-bhateja I see your point. On the other hand, we would have quite some code duplication handling all the BasicType cases for every operation. I'll leave it the way I have it now, and we can still reconsider it if we want to in the future.