Navigation Menu

Skip to content

Commit

Permalink
8302652: [SuperWord] Reduction should happen after loop, when possible
Browse files Browse the repository at this point in the history
Reviewed-by: kvn, pli, jbhateja, sviswanathan
  • Loading branch information
eme64 committed May 23, 2023
1 parent 69f508a commit 06b0a5e
Show file tree
Hide file tree
Showing 16 changed files with 1,032 additions and 239 deletions.
7 changes: 1 addition & 6 deletions src/hotspot/share/opto/compile.cpp
Expand Up @@ -2844,12 +2844,7 @@ void Compile::process_logic_cone_root(PhaseIterGVN &igvn, Node *n, VectorSet &vi
if (mask == nullptr ||
Matcher::match_rule_supported_vector_masked(Op_MacroLogicV, vt->length(), vt->element_basic_type())) {
Node* macro_logic = xform_to_MacroLogicV(igvn, vt, partition, inputs);
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
macro_logic->dump();
}
#endif
VectorNode::trace_new_vector(macro_logic, "MacroLogic");
igvn.replace_node(n, macro_logic);
}
}
Expand Down
10 changes: 10 additions & 0 deletions src/hotspot/share/opto/loopnode.cpp
Expand Up @@ -4634,6 +4634,16 @@ void PhaseIdealLoop::build_and_optimize() {
}
}
}

// Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
if (C->has_loops() && !C->major_progress()) {
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted() && lpt->is_innermost()) {
move_unordered_reduction_out_of_loop(lpt);
}
}
}
}

#ifndef PRODUCT
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/share/opto/loopnode.hpp
Expand Up @@ -1486,6 +1486,9 @@ class PhaseIdealLoop : public PhaseTransform {
bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);

// Move UnorderedReduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);

// Create a scheduled list of nodes control dependent on ctrl set.
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
// Has a use in the vector set
Expand Down
186 changes: 186 additions & 0 deletions src/hotspot/share/opto/loopopts.cpp
Expand Up @@ -41,6 +41,7 @@
#include "opto/rootnode.hpp"
#include "opto/subnode.hpp"
#include "opto/subtypenode.hpp"
#include "opto/vectornode.hpp"
#include "utilities/macros.hpp"

//=============================================================================
Expand Down Expand Up @@ -4120,3 +4121,188 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old

return true;
}

// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// CountedLoop init
// | |
// +------+ | +-----------------------+
// | | | |
// PhiNode (s) |
// | |
// | Vector |
// | | |
// UnorderedReduction (first_ur) |
// | |
// ... Vector |
// | | |
// UnorderedReduction (last_ur) |
// | |
// +---------------------+
//
// We patch the graph to look like this:
//
// CountedLoop identity_vector
// | |
// +-------+ | +---------------+
// | | | |
// PhiNode (v) |
// | |
// | Vector |
// | | |
// VectorAccumulator |
// | |
// ... Vector |
// | | |
// init VectorAccumulator |
// | | | |
// UnorderedReduction +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition).
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");

// Find all Phi nodes with UnorderedReduction on backedge.
CountedLoopNode* cl = loop->_head->as_CountedLoop();
for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
Node* phi = cl->fast_out(j);
// We have a phi with a single use, and a UnorderedReduction on the backedge.
if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
continue;
}

UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();

// Determine types
const TypeVect* vec_t = last_ur->vect_type();
uint vector_length = vec_t->length();
BasicType bt = vec_t->element_basic_type();
const Type* bt_t = Type::get_const_basic_type(bt);

// Convert opcode from vector-reduction -> scalar -> normal-vector-op
const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
const int vopc = VectorNode::opcode(sopc, bt);
if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
DEBUG_ONLY( last_ur->dump(); )
assert(false, "do not have normal vector op for this reduction");
continue; // not implemented -> fails
}

// Traverse up the chain of UnorderedReductions, checking that it loops back to
// the phi. Check that all UnorderedReductions only have a single use, except for
// the last (last_ur), which only has phi as a use in the loop, and all other uses
// are outside the loop.
UnorderedReductionNode* current = last_ur;
UnorderedReductionNode* first_ur = nullptr;
while (true) {
assert(current->is_UnorderedReduction(), "sanity");

// Expect no ctrl and a vector_input from within the loop.
Node* ctrl = current->in(0);
Node* vector_input = current->in(2);
if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
DEBUG_ONLY( current->dump(1); )
assert(false, "reduction has ctrl or bad vector_input");
break; // Chain traversal fails.
}

// Expect single use of UnorderedReduction, except for last_ur.
if (current == last_ur) {
// Expect all uses to be outside the loop, except phi.
for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
Node* use = current->fast_out(k);
if (use != phi && ctrl_or_self(use) == cl) {
DEBUG_ONLY( current->dump(-1); )
assert(false, "reduction has use inside loop");
break; // Chain traversal fails.
}
}
} else {
if (current->outcnt() != 1) {
break; // Chain traversal fails.
}
}

// Expect another UnorderedReduction or phi as the scalar input.
Node* scalar_input = current->in(1);
if (scalar_input->is_UnorderedReduction() &&
scalar_input->Opcode() == current->Opcode()) {
// Move up the UnorderedReduction chain.
current = scalar_input->as_UnorderedReduction();
} else if (scalar_input == phi) {
// Chain terminates at phi.
first_ur = current;
current = nullptr;
break; // Success.
} else {
DEBUG_ONLY( current->dump(1); )
assert(false, "scalar_input is neither phi nor a matchin reduction");
break; // Chain traversal fails.
}
}
if (current != nullptr) {
// Chain traversal was not successful.
continue;
}
assert(first_ur != nullptr, "must have successfully terminated chain traversal");

Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
set_ctrl(identity_scalar, C->root());
VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
register_new_node(identity_vector, C->root());
assert(vec_t == identity_vector->vect_type(), "matching vector type");
VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");

// Turn the scalar phi into a vector phi.
_igvn.rehash_node_delayed(phi);
Node* init = phi->in(1); // Remember init before replacing it.
phi->set_req_X(1, identity_vector, &_igvn);
phi->as_Type()->set_type(vec_t);
_igvn.set_type(phi, vec_t);

// Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
current = first_ur;
while (true) {
// Create vector_accumulator to replace current.
Node* last_vector_accumulator = current->in(1);
Node* vector_input = current->in(2);
VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
register_new_node(vector_accumulator, cl);
_igvn.replace_node(current, vector_accumulator);
VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
if (current == last_ur) {
break;
}
current = vector_accumulator->unique_out()->as_UnorderedReduction();
}

// Create post-loop reduction.
Node* last_accumulator = phi->in(2);
Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);

// Take over uses of last_accumulator that are not in the loop.
for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
Node* use = last_accumulator->out(i);
if (use != phi && use != post_loop_reduction) {
assert(ctrl_or_self(use) != cl, "use must be outside loop");
use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
--i;
}
}
register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");

assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
assert(phi->outcnt() == 1, "accumulator is the only use of phi");
}
}
6 changes: 6 additions & 0 deletions src/hotspot/share/opto/node.hpp
Expand Up @@ -151,6 +151,7 @@ class Pipeline;
class PopulateIndexNode;
class ProjNode;
class RangeCheckNode;
class ReductionNode;
class RegMask;
class RegionNode;
class RootNode;
Expand All @@ -164,6 +165,7 @@ class SubTypeCheckNode;
class Type;
class TypeNode;
class UnlockNode;
class UnorderedReductionNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorMaskedNode;
Expand Down Expand Up @@ -718,6 +720,8 @@ class Node {
DEFINE_CLASS_ID(CompressV, Vector, 4)
DEFINE_CLASS_ID(ExpandV, Vector, 5)
DEFINE_CLASS_ID(CompressM, Vector, 6)
DEFINE_CLASS_ID(Reduction, Vector, 7)
DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0)
DEFINE_CLASS_ID(Con, Type, 8)
DEFINE_CLASS_ID(ConI, Con, 0)

Expand Down Expand Up @@ -941,6 +945,7 @@ class Node {
DEFINE_CLASS_QUERY(PCTable)
DEFINE_CLASS_QUERY(Phi)
DEFINE_CLASS_QUERY(Proj)
DEFINE_CLASS_QUERY(Reduction)
DEFINE_CLASS_QUERY(Region)
DEFINE_CLASS_QUERY(Root)
DEFINE_CLASS_QUERY(SafePoint)
Expand All @@ -950,6 +955,7 @@ class Node {
DEFINE_CLASS_QUERY(Sub)
DEFINE_CLASS_QUERY(SubTypeCheck)
DEFINE_CLASS_QUERY(Type)
DEFINE_CLASS_QUERY(UnorderedReduction)
DEFINE_CLASS_QUERY(Vector)
DEFINE_CLASS_QUERY(VectorMaskCmp)
DEFINE_CLASS_QUERY(VectorUnbox)
Expand Down
29 changes: 5 additions & 24 deletions src/hotspot/share/opto/superword.cpp
Expand Up @@ -3197,12 +3197,7 @@ bool SuperWord::output() {
if (vlen_in_bytes > max_vlen_in_bytes) {
max_vlen_in_bytes = vlen_in_bytes;
}
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
}
}//for (int i = 0; i < _block.length(); i++)

Expand Down Expand Up @@ -3242,6 +3237,7 @@ bool SuperWord::output() {
if (do_reserve_copy()) {
make_reversable.use_new();
}

NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
return true;
}
Expand Down Expand Up @@ -3374,12 +3370,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
return vn;
Expand Down Expand Up @@ -3452,12 +3443,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {

_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
return vn;
}

Expand Down Expand Up @@ -3489,12 +3475,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
}
_igvn.register_new_node_with_optimizer(pk);
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
pk->dump();
}
#endif
VectorNode::trace_new_vector(pk, "SuperWord");
return pk;
}

Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/share/opto/vectorIntrinsics.cpp
Expand Up @@ -1536,7 +1536,7 @@ bool LibraryCallKit::inline_vector_reduction() {
}
}

Node* init = ReductionNode::make_reduction_input(gvn(), opc, elem_bt);
Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt);
Node* value = nullptr;
if (mask == nullptr) {
assert(!is_masked_op, "Masked op needs the mask value never null");
Expand Down

1 comment on commit 06b0a5e

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.