Skip to content

Commit 06b0a5e

Browse files
committed
8302652: [SuperWord] Reduction should happen after loop, when possible
Reviewed-by: kvn, pli, jbhateja, sviswanathan
1 parent 69f508a commit 06b0a5e

File tree

16 files changed

+1032
-239
lines changed

16 files changed

+1032
-239
lines changed

src/hotspot/share/opto/compile.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2844,12 +2844,7 @@ void Compile::process_logic_cone_root(PhaseIterGVN &igvn, Node *n, VectorSet &vi
28442844
if (mask == nullptr ||
28452845
Matcher::match_rule_supported_vector_masked(Op_MacroLogicV, vt->length(), vt->element_basic_type())) {
28462846
Node* macro_logic = xform_to_MacroLogicV(igvn, vt, partition, inputs);
2847-
#ifdef ASSERT
2848-
if (TraceNewVectors) {
2849-
tty->print("new Vector node: ");
2850-
macro_logic->dump();
2851-
}
2852-
#endif
2847+
VectorNode::trace_new_vector(macro_logic, "MacroLogic");
28532848
igvn.replace_node(n, macro_logic);
28542849
}
28552850
}

src/hotspot/share/opto/loopnode.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4634,6 +4634,16 @@ void PhaseIdealLoop::build_and_optimize() {
46344634
}
46354635
}
46364636
}
4637+
4638+
// Move UnorderedReduction out of counted loop. Can be introduced by SuperWord.
4639+
if (C->has_loops() && !C->major_progress()) {
4640+
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
4641+
IdealLoopTree* lpt = iter.current();
4642+
if (lpt->is_counted() && lpt->is_innermost()) {
4643+
move_unordered_reduction_out_of_loop(lpt);
4644+
}
4645+
}
4646+
}
46374647
}
46384648

46394649
#ifndef PRODUCT

src/hotspot/share/opto/loopnode.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1486,6 +1486,9 @@ class PhaseIdealLoop : public PhaseTransform {
14861486
bool partial_peel( IdealLoopTree *loop, Node_List &old_new );
14871487
bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new);
14881488

1489+
// Move UnorderedReduction out of loop if possible
1490+
void move_unordered_reduction_out_of_loop(IdealLoopTree* loop);
1491+
14891492
// Create a scheduled list of nodes control dependent on ctrl set.
14901493
void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched );
14911494
// Has a use in the vector set

src/hotspot/share/opto/loopopts.cpp

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "opto/rootnode.hpp"
4242
#include "opto/subnode.hpp"
4343
#include "opto/subtypenode.hpp"
44+
#include "opto/vectornode.hpp"
4445
#include "utilities/macros.hpp"
4546

4647
//=============================================================================
@@ -4120,3 +4121,188 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old
41204121

41214122
return true;
41224123
}
4124+
4125+
// Having ReductionNodes in the loop is expensive. They need to recursively
4126+
// fold together the vector values, for every vectorized loop iteration. If
4127+
// we encounter the following pattern, we can vector accumulate the values
4128+
// inside the loop, and only have a single UnorderedReduction after the loop.
4129+
//
4130+
// CountedLoop init
4131+
// | |
4132+
// +------+ | +-----------------------+
4133+
// | | | |
4134+
// PhiNode (s) |
4135+
// | |
4136+
// | Vector |
4137+
// | | |
4138+
// UnorderedReduction (first_ur) |
4139+
// | |
4140+
// ... Vector |
4141+
// | | |
4142+
// UnorderedReduction (last_ur) |
4143+
// | |
4144+
// +---------------------+
4145+
//
4146+
// We patch the graph to look like this:
4147+
//
4148+
// CountedLoop identity_vector
4149+
// | |
4150+
// +-------+ | +---------------+
4151+
// | | | |
4152+
// PhiNode (v) |
4153+
// | |
4154+
// | Vector |
4155+
// | | |
4156+
// VectorAccumulator |
4157+
// | |
4158+
// ... Vector |
4159+
// | | |
4160+
// init VectorAccumulator |
4161+
// | | | |
4162+
// UnorderedReduction +-----------+
4163+
//
4164+
// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we
4165+
// use vector_accumulators, which do the same reductions, but only element
4166+
// wise. This is a single operation per vector_accumulator, rather than many
4167+
// for an UnorderedReduction. We can then reduce the last vector_accumulator
4168+
// after the loop, and also reduce the init value into it.
4169+
// We cannot do this with all reductions. Some reductions do not allow the
4170+
// reordering of operations (for example float addition).
4171+
void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) {
  // For every loop phi of `loop` whose backedge input is an UnorderedReduction:
  // verify that the phi feeds a chain of same-opcode UnorderedReductions that
  // loops back to the phi, replace each reduction in that chain by an
  // element-wise vector operation (a "vector_accumulator"), turn the scalar phi
  // into a vector phi that starts from the identity vector, and create one
  // reduction after the loop which folds the original init value and the last
  // accumulator together.
  //
  // Any phi whose chain does not match the expected shape is left untouched.
  assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity");

  // Find all Phi nodes with UnorderedReduction on backedge.
  CountedLoopNode* cl = loop->_head->as_CountedLoop();
  for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) {
    Node* phi = cl->fast_out(j);
    // We have a phi with a single use, and an UnorderedReduction on the backedge.
    if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) {
      continue;
    }

    UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction();

    // Determine types
    const TypeVect* vec_t = last_ur->vect_type();
    uint vector_length    = vec_t->length();
    BasicType bt          = vec_t->element_basic_type();
    const Type* bt_t      = Type::get_const_basic_type(bt);

    // Convert opcode from vector-reduction -> scalar -> normal-vector-op
    const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt);
    const int vopc = VectorNode::opcode(sopc, bt);
    if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) {
      DEBUG_ONLY( last_ur->dump(); )
      assert(false, "do not have normal vector op for this reduction");
      continue; // not implemented -> fails
    }

    // Traverse up the chain of UnorderedReductions, checking that it loops back to
    // the phi. Check that all UnorderedReductions only have a single use, except for
    // the last (last_ur), which only has phi as a use in the loop, and all other uses
    // are outside the loop.
    //
    // Protocol: on success, first_ur is set and current is reset to nullptr. On any
    // failure we leave the while loop with current != nullptr.
    UnorderedReductionNode* current = last_ur;
    UnorderedReductionNode* first_ur = nullptr;
    while (true) {
      assert(current->is_UnorderedReduction(), "sanity");

      // Expect no ctrl and a vector_input from within the loop.
      Node* ctrl = current->in(0);
      Node* vector_input = current->in(2);
      if (ctrl != nullptr || get_ctrl(vector_input) != cl) {
        DEBUG_ONLY( current->dump(1); )
        assert(false, "reduction has ctrl or bad vector_input");
        break; // Chain traversal fails.
      }

      // All accumulators below are created with vopc / vec_t, which were derived
      // from last_ur only. A reduction in the chain with a different vector type
      // (length or element type) can therefore not be replaced correctly.
      if (current->vect_type() != vec_t) {
        break; // Chain traversal fails.
      }

      // Expect single use of UnorderedReduction, except for last_ur.
      if (current == last_ur) {
        // Expect all uses to be outside the loop, except phi.
        // A plain "break" inside the for loop would only leave the for loop and
        // the traversal would wrongly continue (in product builds, where the
        // assert is compiled out). Record the failure and leave the while loop
        // explicitly instead.
        bool has_use_in_loop = false;
        for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) {
          Node* use = current->fast_out(k);
          if (use != phi && ctrl_or_self(use) == cl) {
            DEBUG_ONLY( current->dump(-1); )
            assert(false, "reduction has use inside loop");
            has_use_in_loop = true;
            break;
          }
        }
        if (has_use_in_loop) {
          break; // Chain traversal fails.
        }
      } else {
        if (current->outcnt() != 1) {
          break; // Chain traversal fails.
        }
      }

      // Expect another UnorderedReduction or phi as the scalar input.
      Node* scalar_input = current->in(1);
      if (scalar_input->is_UnorderedReduction() &&
          scalar_input->Opcode() == current->Opcode()) {
        // Move up the UnorderedReduction chain.
        current = scalar_input->as_UnorderedReduction();
      } else if (scalar_input == phi) {
        // Chain terminates at phi.
        first_ur = current;
        current = nullptr;
        break; // Success.
      } else {
        DEBUG_ONLY( current->dump(1); )
        assert(false, "scalar_input is neither phi nor a matchin reduction");
        break; // Chain traversal fails.
      }
    }
    if (current != nullptr) {
      // Chain traversal was not successful.
      continue;
    }
    assert(first_ur != nullptr, "must have successfully terminated chain traversal");

    // Build the identity vector that becomes the new loop-entry value of the phi.
    Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt);
    set_ctrl(identity_scalar, C->root());
    VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t);
    register_new_node(identity_vector, C->root());
    assert(vec_t == identity_vector->vect_type(), "matching vector type");
    VectorNode::trace_new_vector(identity_vector, "UnorderedReduction");

    // Turn the scalar phi into a vector phi.
    _igvn.rehash_node_delayed(phi);
    Node* init = phi->in(1); // Remember init before replacing it.
    phi->set_req_X(1, identity_vector, &_igvn);
    phi->as_Type()->set_type(vec_t);
    _igvn.set_type(phi, vec_t);

    // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators.
    current = first_ur;
    while (true) {
      // Create vector_accumulator to replace current.
      Node* last_vector_accumulator = current->in(1);
      Node* vector_input            = current->in(2);
      VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t);
      register_new_node(vector_accumulator, cl);
      _igvn.replace_node(current, vector_accumulator);
      VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction");
      if (current == last_ur) {
        break;
      }
      // Every reduction except last_ur was verified above to have exactly one use,
      // namely the next reduction in the chain; the accumulator inherited that use.
      current = vector_accumulator->unique_out()->as_UnorderedReduction();
    }

    // Create post-loop reduction.
    Node* last_accumulator = phi->in(2);
    Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt);

    // Take over uses of last_accumulator that are not in the loop.
    for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) {
      Node* use = last_accumulator->out(i);
      if (use != phi && use != post_loop_reduction) {
        assert(ctrl_or_self(use) != cl, "use must be outside loop");
        use->replace_edge(last_accumulator, post_loop_reduction, &_igvn);
        --i; // The out-edge was removed; revisit the same index.
      }
    }
    register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl));
    VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction");

    assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction");
    assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator");
    assert(phi->outcnt() == 1, "accumulator is the only use of phi");
  }
}

src/hotspot/share/opto/node.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ class Pipeline;
151151
class PopulateIndexNode;
152152
class ProjNode;
153153
class RangeCheckNode;
154+
class ReductionNode;
154155
class RegMask;
155156
class RegionNode;
156157
class RootNode;
@@ -164,6 +165,7 @@ class SubTypeCheckNode;
164165
class Type;
165166
class TypeNode;
166167
class UnlockNode;
168+
class UnorderedReductionNode;
167169
class VectorNode;
168170
class LoadVectorNode;
169171
class LoadVectorMaskedNode;
@@ -718,6 +720,8 @@ class Node {
718720
DEFINE_CLASS_ID(CompressV, Vector, 4)
719721
DEFINE_CLASS_ID(ExpandV, Vector, 5)
720722
DEFINE_CLASS_ID(CompressM, Vector, 6)
723+
DEFINE_CLASS_ID(Reduction, Vector, 7)
724+
DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0)
721725
DEFINE_CLASS_ID(Con, Type, 8)
722726
DEFINE_CLASS_ID(ConI, Con, 0)
723727

@@ -941,6 +945,7 @@ class Node {
941945
DEFINE_CLASS_QUERY(PCTable)
942946
DEFINE_CLASS_QUERY(Phi)
943947
DEFINE_CLASS_QUERY(Proj)
948+
DEFINE_CLASS_QUERY(Reduction)
944949
DEFINE_CLASS_QUERY(Region)
945950
DEFINE_CLASS_QUERY(Root)
946951
DEFINE_CLASS_QUERY(SafePoint)
@@ -950,6 +955,7 @@ class Node {
950955
DEFINE_CLASS_QUERY(Sub)
951956
DEFINE_CLASS_QUERY(SubTypeCheck)
952957
DEFINE_CLASS_QUERY(Type)
958+
DEFINE_CLASS_QUERY(UnorderedReduction)
953959
DEFINE_CLASS_QUERY(Vector)
954960
DEFINE_CLASS_QUERY(VectorMaskCmp)
955961
DEFINE_CLASS_QUERY(VectorUnbox)

src/hotspot/share/opto/superword.cpp

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3197,12 +3197,7 @@ bool SuperWord::output() {
31973197
if (vlen_in_bytes > max_vlen_in_bytes) {
31983198
max_vlen_in_bytes = vlen_in_bytes;
31993199
}
3200-
#ifdef ASSERT
3201-
if (TraceNewVectors) {
3202-
tty->print("new Vector node: ");
3203-
vn->dump();
3204-
}
3205-
#endif
3200+
VectorNode::trace_new_vector(vn, "SuperWord");
32063201
}
32073202
}//for (int i = 0; i < _block.length(); i++)
32083203

@@ -3242,6 +3237,7 @@ bool SuperWord::output() {
32423237
if (do_reserve_copy()) {
32433238
make_reversable.use_new();
32443239
}
3240+
32453241
NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
32463242
return true;
32473243
}
@@ -3374,12 +3370,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
33743370
assert(VectorNode::is_populate_index_supported(iv_bt), "Should support");
33753371
const TypeVect* vt = TypeVect::make(iv_bt, vlen);
33763372
Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt);
3377-
#ifdef ASSERT
3378-
if (TraceNewVectors) {
3379-
tty->print("new Vector node: ");
3380-
vn->dump();
3381-
}
3382-
#endif
3373+
VectorNode::trace_new_vector(vn, "SuperWord");
33833374
_igvn.register_new_node_with_optimizer(vn);
33843375
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
33853376
return vn;
@@ -3452,12 +3443,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
34523443

34533444
_igvn.register_new_node_with_optimizer(vn);
34543445
_phase->set_ctrl(vn, _phase->get_ctrl(opd));
3455-
#ifdef ASSERT
3456-
if (TraceNewVectors) {
3457-
tty->print("new Vector node: ");
3458-
vn->dump();
3459-
}
3460-
#endif
3446+
VectorNode::trace_new_vector(vn, "SuperWord");
34613447
return vn;
34623448
}
34633449

@@ -3489,12 +3475,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
34893475
}
34903476
_igvn.register_new_node_with_optimizer(pk);
34913477
_phase->set_ctrl(pk, _phase->get_ctrl(opd));
3492-
#ifdef ASSERT
3493-
if (TraceNewVectors) {
3494-
tty->print("new Vector node: ");
3495-
pk->dump();
3496-
}
3497-
#endif
3478+
VectorNode::trace_new_vector(pk, "SuperWord");
34983479
return pk;
34993480
}
35003481

src/hotspot/share/opto/vectorIntrinsics.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1536,7 +1536,7 @@ bool LibraryCallKit::inline_vector_reduction() {
15361536
}
15371537
}
15381538

1539-
Node* init = ReductionNode::make_reduction_input(gvn(), opc, elem_bt);
1539+
Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt);
15401540
Node* value = nullptr;
15411541
if (mask == nullptr) {
15421542
assert(!is_masked_op, "Masked op needs the mask value never null");

0 commit comments

Comments
 (0)