|
41 | 41 | #include "opto/rootnode.hpp" |
42 | 42 | #include "opto/subnode.hpp" |
43 | 43 | #include "opto/subtypenode.hpp" |
| 44 | +#include "opto/vectornode.hpp" |
44 | 45 | #include "utilities/macros.hpp" |
45 | 46 |
|
46 | 47 | //============================================================================= |
@@ -4120,3 +4121,188 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old |
4120 | 4121 |
|
4121 | 4122 | return true; |
4122 | 4123 | } |
| 4124 | + |
| 4125 | +// Having ReductionNodes in the loop is expensive. They need to recursively |
| 4126 | +// fold together the vector values, for every vectorized loop iteration. If |
| 4127 | +// we encounter the following pattern, we can vector accumulate the values |
| 4128 | +// inside the loop, and only have a single UnorderedReduction after the loop. |
| 4129 | +// |
| 4130 | +// CountedLoop init |
| 4131 | +// | | |
| 4132 | +// +------+ | +-----------------------+ |
| 4133 | +// | | | | |
| 4134 | +// PhiNode (s) | |
| 4135 | +// | | |
| 4136 | +// | Vector | |
| 4137 | +// | | | |
| 4138 | +// UnorderedReduction (first_ur) | |
| 4139 | +// | | |
| 4140 | +// ... Vector | |
| 4141 | +// | | | |
| 4142 | +// UnorderedReduction (last_ur) | |
| 4143 | +// | | |
| 4144 | +// +---------------------+ |
| 4145 | +// |
| 4146 | +// We patch the graph to look like this: |
| 4147 | +// |
| 4148 | +// CountedLoop identity_vector |
| 4149 | +// | | |
| 4150 | +// +-------+ | +---------------+ |
| 4151 | +// | | | | |
| 4152 | +// PhiNode (v) | |
| 4153 | +// | | |
| 4154 | +// | Vector | |
| 4155 | +// | | | |
| 4156 | +// VectorAccumulator | |
| 4157 | +// | | |
| 4158 | +// ... Vector | |
| 4159 | +// | | | |
| 4160 | +// init VectorAccumulator | |
| 4161 | +// | | | | |
| 4162 | +// UnorderedReduction +-----------+ |
| 4163 | +// |
| 4164 | +// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we |
| 4165 | +// use vector_accumulators, which do the same reductions, but only element |
| 4166 | +// wise. This is a single operation per vector_accumulator, rather than many |
| 4167 | +// for a UnorderedReduction. We can then reduce the last vector_accumulator |
| 4168 | +// after the loop, and also reduce the init value into it. |
| 4169 | +// We cannot do this with all reductions. Some reductions do not allow the
| 4170 | +// reordering of operations (for example float addition).
| 4171 | +void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { |
| 4172 | + assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity"); |
| 4173 | + |
| 4174 | + // Find all Phi nodes with UnorderedReduction on backedge. |
| 4175 | + CountedLoopNode* cl = loop->_head->as_CountedLoop(); |
| 4176 | + for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) { |
| 4177 | + Node* phi = cl->fast_out(j); |
| 4178 | + // We have a phi with a single use, and a UnorderedReduction on the backedge. |
| 4179 | + if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) { |
| 4180 | + continue; |
| 4181 | + } |
| 4182 | + |
| 4183 | + UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction(); |
| 4184 | + |
| 4185 | + // Determine types |
| 4186 | + const TypeVect* vec_t = last_ur->vect_type(); |
| 4187 | + uint vector_length = vec_t->length(); |
| 4188 | + BasicType bt = vec_t->element_basic_type(); |
| 4189 | + const Type* bt_t = Type::get_const_basic_type(bt); |
| 4190 | + |
| 4191 | + // Convert opcode from vector-reduction -> scalar -> normal-vector-op |
| 4192 | + const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt); |
| 4193 | + const int vopc = VectorNode::opcode(sopc, bt); |
| 4194 | + if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) { |
| 4195 | + DEBUG_ONLY( last_ur->dump(); ) |
| 4196 | + assert(false, "do not have normal vector op for this reduction"); |
| 4197 | + continue; // not implemented -> fails |
| 4198 | + } |
| 4199 | + |
| 4200 | + // Traverse up the chain of UnorderedReductions, checking that it loops back to |
| 4201 | + // the phi. Check that all UnorderedReductions only have a single use, except for |
| 4202 | + // the last (last_ur), which only has phi as a use in the loop, and all other uses |
| 4203 | + // are outside the loop. |
| 4204 | + UnorderedReductionNode* current = last_ur; |
| 4205 | + UnorderedReductionNode* first_ur = nullptr; |
| 4206 | + while (true) { |
| 4207 | + assert(current->is_UnorderedReduction(), "sanity"); |
| 4208 | + |
| 4209 | + // Expect no ctrl and a vector_input from within the loop. |
| 4210 | + Node* ctrl = current->in(0); |
| 4211 | + Node* vector_input = current->in(2); |
| 4212 | + if (ctrl != nullptr || get_ctrl(vector_input) != cl) { |
| 4213 | + DEBUG_ONLY( current->dump(1); ) |
| 4214 | + assert(false, "reduction has ctrl or bad vector_input"); |
| 4215 | + break; // Chain traversal fails. |
| 4216 | + } |
| 4217 | + |
| 4218 | + // Expect single use of UnorderedReduction, except for last_ur. |
| 4219 | + if (current == last_ur) { |
| 4220 | + // Expect all uses to be outside the loop, except phi. |
| 4221 | + for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) { |
| 4222 | + Node* use = current->fast_out(k); |
| 4223 | + if (use != phi && ctrl_or_self(use) == cl) { |
| 4224 | + DEBUG_ONLY( current->dump(-1); ) |
| 4225 | + assert(false, "reduction has use inside loop"); |
| 4226 | + break; // Chain traversal fails. |
| 4227 | + } |
| 4228 | + } |
| 4229 | + } else { |
| 4230 | + if (current->outcnt() != 1) { |
| 4231 | + break; // Chain traversal fails. |
| 4232 | + } |
| 4233 | + } |
| 4234 | + |
| 4235 | + // Expect another UnorderedReduction or phi as the scalar input. |
| 4236 | + Node* scalar_input = current->in(1); |
| 4237 | + if (scalar_input->is_UnorderedReduction() && |
| 4238 | + scalar_input->Opcode() == current->Opcode()) { |
| 4239 | + // Move up the UnorderedReduction chain. |
| 4240 | + current = scalar_input->as_UnorderedReduction(); |
| 4241 | + } else if (scalar_input == phi) { |
| 4242 | + // Chain terminates at phi. |
| 4243 | + first_ur = current; |
| 4244 | + current = nullptr; |
| 4245 | + break; // Success. |
| 4246 | + } else { |
| 4247 | + DEBUG_ONLY( current->dump(1); ) |
| 4248 | + assert(false, "scalar_input is neither phi nor a matchin reduction"); |
| 4249 | + break; // Chain traversal fails. |
| 4250 | + } |
| 4251 | + } |
| 4252 | + if (current != nullptr) { |
| 4253 | + // Chain traversal was not successful. |
| 4254 | + continue; |
| 4255 | + } |
| 4256 | + assert(first_ur != nullptr, "must have successfully terminated chain traversal"); |
| 4257 | + |
| 4258 | + Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt); |
| 4259 | + set_ctrl(identity_scalar, C->root()); |
| 4260 | + VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); |
| 4261 | + register_new_node(identity_vector, C->root()); |
| 4262 | + assert(vec_t == identity_vector->vect_type(), "matching vector type"); |
| 4263 | + VectorNode::trace_new_vector(identity_vector, "UnorderedReduction"); |
| 4264 | + |
| 4265 | + // Turn the scalar phi into a vector phi. |
| 4266 | + _igvn.rehash_node_delayed(phi); |
| 4267 | + Node* init = phi->in(1); // Remember init before replacing it. |
| 4268 | + phi->set_req_X(1, identity_vector, &_igvn); |
| 4269 | + phi->as_Type()->set_type(vec_t); |
| 4270 | + _igvn.set_type(phi, vec_t); |
| 4271 | + |
| 4272 | + // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators. |
| 4273 | + current = first_ur; |
| 4274 | + while (true) { |
| 4275 | + // Create vector_accumulator to replace current. |
| 4276 | + Node* last_vector_accumulator = current->in(1); |
| 4277 | + Node* vector_input = current->in(2); |
| 4278 | + VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t); |
| 4279 | + register_new_node(vector_accumulator, cl); |
| 4280 | + _igvn.replace_node(current, vector_accumulator); |
| 4281 | + VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction"); |
| 4282 | + if (current == last_ur) { |
| 4283 | + break; |
| 4284 | + } |
| 4285 | + current = vector_accumulator->unique_out()->as_UnorderedReduction(); |
| 4286 | + } |
| 4287 | + |
| 4288 | + // Create post-loop reduction. |
| 4289 | + Node* last_accumulator = phi->in(2); |
| 4290 | + Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt); |
| 4291 | + |
| 4292 | + // Take over uses of last_accumulator that are not in the loop. |
| 4293 | + for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) { |
| 4294 | + Node* use = last_accumulator->out(i); |
| 4295 | + if (use != phi && use != post_loop_reduction) { |
| 4296 | + assert(ctrl_or_self(use) != cl, "use must be outside loop"); |
| 4297 | + use->replace_edge(last_accumulator, post_loop_reduction, &_igvn); |
| 4298 | + --i; |
| 4299 | + } |
| 4300 | + } |
| 4301 | + register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl)); |
| 4302 | + VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction"); |
| 4303 | + |
| 4304 | + assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction"); |
| 4305 | + assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator"); |
| 4306 | + assert(phi->outcnt() == 1, "accumulator is the only use of phi"); |
| 4307 | + } |
| 4308 | +} |
0 commit comments