Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions src/hotspot/share/opto/compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2844,12 +2844,7 @@ void Compile::process_logic_cone_root(PhaseIterGVN &igvn, Node *n, VectorSet &vi
if (mask == nullptr ||
Matcher::match_rule_supported_vector_masked(Op_MacroLogicV, vt->length(), vt->element_basic_type())) {
Node* macro_logic = xform_to_MacroLogicV(igvn, vt, partition, inputs);
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
macro_logic->dump();
}
#endif
VectorNode::trace_new_vector(macro_logic, "MacroLogic");
igvn.replace_node(n, macro_logic);
}
}
Expand Down
10 changes: 10 additions & 0 deletions src/hotspot/share/opto/loopnode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4628,6 +4628,16 @@ void PhaseIdealLoop::build_and_optimize() {
}
}
}

// Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
if (C->has_loops() && !C->major_progress()) {
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted() && lpt->is_innermost()) {
move_unordered_reduction_out_of_loop(lpt);
}
}
}
}

#ifndef PRODUCT
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/share/opto/loopnode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1484,6 +1484,9 @@ class PhaseIdealLoop : public PhaseTransform {
bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);

// Move UnorderedReduction out of loop if possible
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);

// Create a scheduled list of nodes control dependent on ctrl set.
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
// Has a use in the vector set
Expand Down
186 changes: 186 additions & 0 deletions src/hotspot/share/opto/loopopts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "opto/rootnode.hpp"
#include "opto/subnode.hpp"
#include "opto/subtypenode.hpp"
#include "opto/vectornode.hpp"
#include "utilities/macros.hpp"

//=============================================================================
Expand Down Expand Up @@ -4120,3 +4121,188 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old

return true;
}

// Having ReductionNodes in the loop is expensive. They need to recursively
// fold together the vector values, for every vectorized loop iteration. If
// we encounter the following pattern, we can vector accumulate the values
// inside the loop, and only have a single UnorderedReduction after the loop.
//
// CountedLoop init
// | |
// +------+ | +-----------------------+
// | | | |
// PhiNode (s) |
// | |
// | Vector |
// | | |
// UnorderedReduction (first_ur) |
// | |
// ... Vector |
// | | |
// UnorderedReduction (last_ur) |
// | |
// +---------------------+
//
// We patch the graph to look like this:
//
// CountedLoop identity_vector
// | |
// +-------+ | +---------------+
// | | | |
// PhiNode (v) |
// | |
// | Vector |
// | | |
// VectorAccumulator |
// | |
// ... Vector |
// | | |
// init VectorAccumulator |
// | | | |
// UnorderedReduction +-----------+
//
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
// use vector_accumulators, which do the same reductions, but only element
// wise. This is a single operation per vector_accumulator, rather than many
// for a UnorderedReduction. We can then reduce the last vector_accumulator
// after the loop, and also reduce the init value into it.
// We can not do this with all reductions. Some reductions do not allow the
// reordering of operations (for example float addition).
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");

// Find all Phi nodes with UnorderedReduction on backedge.
CountedLoopNode* cl = loop->_head->as_CountedLoop();
for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
Node* phi = cl->fast_out(j);
// We have a phi with a single use, and a UnorderedReduction on the backedge.
if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
continue;
}

UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();

// Determine types
const TypeVect* vec_t = last_ur->vect_type();
uint vector_length = vec_t->length();
BasicType bt = vec_t->element_basic_type();
const Type* bt_t = Type::get_const_basic_type(bt);

// Convert opcode from vector-reduction -> scalar -> normal-vector-op
const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other changes looks good to me, can you rename VectorNode::scalar_opcode to ReductionNode::scalar_opcode
, also move out vector opcode cases into a separate vector-to-scalar mapping routine if needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it not better to have VectorNode::scalar_opcode? It is more general - maybe it is useful in the future.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it not better to have VectorNode::scalar_opcode? It is more general - maybe it is useful in the future.

Not a blocker, but we intend to get a scalar opcode for ReductionNode, we have different factory method for Vector/Reduction Nodes, you can keep it for now

Best Regards,
Jatin

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jatin-bhateja I see your point. On the other hand, we would have quite some code duplication handling all the BasicType cases for every operation. I'll leave it the way I have it now, and we can still reconsider it if we want to in the future.

const int vopc = VectorNode::opcode(sopc, bt);
if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
DEBUG_ONLY( last_ur->dump(); )
assert(false, "do not have normal vector op for this reduction");
continue; // not implemented -> fails
}

// Traverse up the chain of UnorderedReductions, checking that it loops back to
// the phi. Check that all UnorderedReductions only have a single use, except for
// the last (last_ur), which only has phi as a use in the loop, and all other uses
// are outside the loop.
UnorderedReductionNode* current = last_ur;
UnorderedReductionNode* first_ur = nullptr;
while (true) {
assert(current->is_UnorderedReduction(), "sanity");

// Expect no ctrl and a vector_input from within the loop.
Node* ctrl = current->in(0);
Node* vector_input = current->in(2);
if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
DEBUG_ONLY( current->dump(1); )
assert(false, "reduction has ctrl or bad vector_input");
break; // Chain traversal fails.
}

// Expect single use of UnorderedReduction, except for last_ur.
if (current == last_ur) {
// Expect all uses to be outside the loop, except phi.
for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
Node* use = current->fast_out(k);
if (use != phi && ctrl_or_self(use) == cl) {
DEBUG_ONLY( current->dump(-1); )
assert(false, "reduction has use inside loop");
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have been wondering, it is right to bailout here from the optimization but why do we assert here? It is perfectly legal (if not very meaningful) to have a scalar use of the last unordered reduction within the loop. This will still auto vectorize as the reduction is to a scalar. e.g. a slight modification of the SumRed_Int.java still auto vectorizes and has a use of the last unordered reduction within the loop:
public static int sumReductionImplement(
int[] a,
int[] b,
int[] c,
int total) {
int sum = 0;
for (int i = 0; i < a.length; i++) {
total += (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
sum = total + i;
}
return total + sum;
}
Do you think this is a valid concern?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, the assert is not very necessary, but I'd rather have an assert more in there and figure out what cases I missed when the fuzzer eventually finds a case. But if it is wished I can also just remove that assert.

I wrote this Test.java:

class Test {
    static final int RANGE = 1024;
    static final int ITER  = 10_000;

    static void init(int[] data) {
        for (int i = 0; i < RANGE; i++) {
            data[i] = i + 1;
        }
    }

    static int test(int[] data, int sum) {
        int x = 0;
        for (int i = 0; i < RANGE; i++) {
            sum += 11 * data[i];
            x = sum & i; // what happens with this AndI ?
        }
        return sum + x;
    }

    public static void main(String[] args) {
        int[] data = new int[RANGE];
        init(data);
        for (int i = 0; i < ITER; i++) {
            test(data, i);
        }
    }
}

And ran it like this, with my patch:

./java -Xbatch -XX:CompileCommand=compileonly,Test::test  -XX:+TraceNewVectors -XX:+TraceSuperWord Test.java

Everything vectorized as usual. But what happens with the AndI? It actually drops outside the loop. Its left input is the AddReductionVI, and the right input is (Phi #tripcount) + 63 (the last i thus already drops outside the loop).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: If I have uses of the reduction in each iteration, then we already refuse to vectorize the reduction, as in this case:

    static int test(int[] data, int sum) {
        int x = 0;
        for (int i = 0; i < RANGE; i++) {
            sum += 11 * data[i];
            x += sum & i;  // vector use of sum prevents vectorization of sum's reduction-vectorization -> whole chain not vectorized
        }
        return sum + x;
    }

Copy link
Contributor Author

@eme64 eme64 May 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My conclusion, given my best understanding: eigher we have a use of the sum in all iterations, which prevents vectorization of the reduction. Or we only have a use of the last iteration, and it drops out of the loop already.

So if there is such an odd example, I'd rather we run into an assert in debug and look at it again. Maybe it would be perfectly legal, or maybe it reveals a bug here or elsewhere in the reduction code.

@sviswa7 what do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, but this hits one of my asserts:

    static int test(int[] data, int sum) {
        int x = 0;
        for (int i = 0; i < RANGE; i+=8) {
            sum += 11 * data[i+0];
            sum += 11 * data[i+1];
            sum += 11 * data[i+2];
            sum += 11 * data[i+3];
            x = sum + i;
            sum += 11 * data[i+4];
            sum += 11 * data[i+5];
            sum += 11 * data[i+6];
            sum += 11 * data[i+7];
        }
        return sum + x;
    }

With

./java -Xbatch -XX:CompileCommand=compileonly,Test::test -XX:+TraceNewVectors -XX:+TraceSuperWord -XX:MaxVectorSize=16 Test.java

Triggers

assert(false, "reduction (not last) has more than one use");

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will add this as a regression test, and remove that assert. Thanks @sviswa7 for making me look at this more closely :)

Still, I think it may be valuable to keep these two asserts - both indicate that something strange has happened:

assert(false, "reduction has use inside loop");

assert(false, "reduction has ctrl or bad vector_input");

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me.

break; // Chain traversal fails.
}
}
} else {
if (current->outcnt() != 1) {
break; // Chain traversal fails.
}
}

// Expect another UnorderedReduction or phi as the scalar input.
Node* scalar_input = current->in(1);
if (scalar_input->is_UnorderedReduction() &&
scalar_input->Opcode() == current->Opcode()) {
// Move up the UnorderedReduction chain.
current = scalar_input->as_UnorderedReduction();
} else if (scalar_input == phi) {
// Chain terminates at phi.
first_ur = current;
current = nullptr;
break; // Success.
} else {
DEBUG_ONLY( current->dump(1); )
assert(false, "scalar_input is neither phi nor a matchin reduction");
break; // Chain traversal fails.
}
}
if (current != nullptr) {
// Chain traversal was not successful.
continue;
}
assert(first_ur != nullptr, "must have successfully terminated chain traversal");

Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
set_ctrl(identity_scalar, C->root());
VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
register_new_node(identity_vector, C->root());
assert(vec_t == identity_vector->vect_type(), "matching vector type");
VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");

// Turn the scalar phi into a vector phi.
_igvn.rehash_node_delayed(phi);
Node* init = phi->in(1); // Remember init before replacing it.
phi->set_req_X(1, identity_vector, &_igvn);
phi->as_Type()->set_type(vec_t);
_igvn.set_type(phi, vec_t);

// Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
current = first_ur;
while (true) {
// Create vector_accumulator to replace current.
Node* last_vector_accumulator = current->in(1);
Node* vector_input = current->in(2);
VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
register_new_node(vector_accumulator, cl);
_igvn.replace_node(current, vector_accumulator);
VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
if (current == last_ur) {
break;
}
current = vector_accumulator->unique_out()->as_UnorderedReduction();
}

// Create post-loop reduction.
Node* last_accumulator = phi->in(2);
Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);

// Take over uses of last_accumulator that are not in the loop.
for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
Node* use = last_accumulator->out(i);
if (use != phi && use != post_loop_reduction) {
assert(ctrl_or_self(use) != cl, "use must be outside loop");
use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
--i;
}
}
register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");

assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
assert(phi->outcnt() == 1, "accumulator is the only use of phi");
}
}
6 changes: 6 additions & 0 deletions src/hotspot/share/opto/node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ class Pipeline;
class PopulateIndexNode;
class ProjNode;
class RangeCheckNode;
class ReductionNode;
class RegMask;
class RegionNode;
class RootNode;
Expand All @@ -164,6 +165,7 @@ class SubTypeCheckNode;
class Type;
class TypeNode;
class UnlockNode;
class UnorderedReductionNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorMaskedNode;
Expand Down Expand Up @@ -718,6 +720,8 @@ class Node {
DEFINE_CLASS_ID(CompressV, Vector, 4)
DEFINE_CLASS_ID(ExpandV, Vector, 5)
DEFINE_CLASS_ID(CompressM, Vector, 6)
DEFINE_CLASS_ID(Reduction, Vector, 7)
DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0)
DEFINE_CLASS_ID(Con, Type, 8)
DEFINE_CLASS_ID(ConI, Con, 0)

Expand Down Expand Up @@ -941,6 +945,7 @@ class Node {
DEFINE_CLASS_QUERY(PCTable)
DEFINE_CLASS_QUERY(Phi)
DEFINE_CLASS_QUERY(Proj)
DEFINE_CLASS_QUERY(Reduction)
DEFINE_CLASS_QUERY(Region)
DEFINE_CLASS_QUERY(Root)
DEFINE_CLASS_QUERY(SafePoint)
Expand All @@ -950,6 +955,7 @@ class Node {
DEFINE_CLASS_QUERY(Sub)
DEFINE_CLASS_QUERY(SubTypeCheck)
DEFINE_CLASS_QUERY(Type)
DEFINE_CLASS_QUERY(UnorderedReduction)
DEFINE_CLASS_QUERY(Vector)
DEFINE_CLASS_QUERY(VectorMaskCmp)
DEFINE_CLASS_QUERY(VectorUnbox)
Expand Down
29 changes: 5 additions & 24 deletions src/hotspot/share/opto/superword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3329,12 +3329,7 @@ bool SuperWord::output() {
if (vlen_in_bytes > max_vlen_in_bytes) {
max_vlen_in_bytes = vlen_in_bytes;
}
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
}
}//for (int i = 0; i < _block.length(); i++)

Expand Down Expand Up @@ -3374,6 +3369,7 @@ bool SuperWord::output() {
if (do_reserve_copy()) {
make_reversable.use_new();
}

NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
return true;
}
Expand Down Expand Up @@ -3506,12 +3502,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
return vn;
Expand Down Expand Up @@ -3584,12 +3575,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {

_igvn.register_new_node_with_optimizer(vn);
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
vn->dump();
}
#endif
VectorNode::trace_new_vector(vn, "SuperWord");
return vn;
}

Expand Down Expand Up @@ -3621,12 +3607,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
}
_igvn.register_new_node_with_optimizer(pk);
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
#ifdef ASSERT
if (TraceNewVectors) {
tty->print("new Vector node: ");
pk->dump();
}
#endif
VectorNode::trace_new_vector(pk, "SuperWord");
return pk;
}

Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/share/opto/vectorIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1536,7 +1536,7 @@ bool LibraryCallKit::inline_vector_reduction() {
}
}

Node* init = ReductionNode::make_reduction_input(gvn(), opc, elem_bt);
Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt);
Node* value = nullptr;
if (mask == nullptr) {
assert(!is_masked_op, "Masked op needs the mask value never null");
Expand Down
Loading