-
Notifications
You must be signed in to change notification settings - Fork 6.2k
8340093: C2 SuperWord: implement cost model #27803
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
edf26bb
ce4ce1f
8ac7d0a
57e69df
30d916f
da3b4b3
b7b5ac0
49f9242
b32afed
a77059f
a8f11c4
693dcf1
2a9aba2
baa41e4
24a6c33
5373397
f0d9fa2
35eec33
8e4a2ce
ed16cf6
802054a
90691a8
8ecbf71
d3dad21
b825109
0de4a97
ff4c1da
af8dd43
d457b04
180d066
c2768d8
1edb758
4c14aca
18a8898
2bd9c94
a8d31d7
3f7ef58
22dab5a
d79df4f
23906b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : | |
| ), | ||
| _vpointer_for_main_loop_alignment(nullptr), | ||
| _aw_for_main_loop_alignment(0), | ||
| _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style | ||
| _num_work_vecs(0), // amount of vector work we have | ||
| _num_reductions(0) // amount of reduction work we have | ||
| _do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: these fields (`_num_work_vecs`, `_num_reductions`) were part of the old reduction heuristic and are no longer needed. |
||
| { | ||
| } | ||
|
|
||
|
|
@@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() { | |
|
|
||
| // Remove packs that are not profitable. | ||
| void SuperWord::filter_packs_for_profitable() { | ||
| // Count the number of reductions vs other vector ops, for the | ||
| // reduction profitability heuristic. | ||
| for (int i = 0; i < _packset.length(); i++) { | ||
| Node_List* pack = _packset.at(i); | ||
| Node* n = pack->at(0); | ||
| if (is_marked_reduction(n)) { | ||
| _num_reductions++; | ||
| } else { | ||
| _num_work_vecs++; | ||
| } | ||
| } | ||
|
|
||
| // Remove packs that are not profitable | ||
| auto filter = [&](const Node_List* pack) { | ||
| return profitable(pack); | ||
|
|
@@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { | |
| if (p0 != nullptr) { | ||
| int opc = p0->Opcode(); | ||
| if (is_marked_reduction(p0)) { | ||
| const Type *arith_type = p0->bottom_type(); | ||
| // This heuristic predicts that 2-element reductions for INT/LONG are not | ||
| // profitable. This heuristic was added in JDK-8078563. The argument | ||
| // was that reductions are not just a single instruction, but multiple, and | ||
| // hence it is not directly clear that they are profitable. If we only have | ||
| // two elements per vector, then the performance gains from non-reduction | ||
| // vectors are at most going from 2 scalar instructions to 1 vector instruction. | ||
| // But a 2-element reduction vector goes from 2 scalar instructions to | ||
| // 3 instructions (1 shuffle and two reduction ops). | ||
| // However, this optimization assumes that these reductions stay in the loop | ||
| // which may not be true any more in most cases after the introduction of: | ||
| // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop | ||
| // Hence, this heuristic has room for improvement. | ||
| bool is_two_element_int_or_long_reduction = (size == 2) && | ||
| (arith_type->basic_type() == T_INT || | ||
| arith_type->basic_type() == T_LONG); | ||
| if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) { | ||
| #ifndef PRODUCT | ||
| if (is_trace_superword_rejections()) { | ||
| tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable."); | ||
| tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); | ||
| } | ||
| #endif | ||
| return false; | ||
| } | ||
| const Type* arith_type = p0->bottom_type(); | ||
| retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); | ||
| } else if (VectorNode::is_convert_opcode(opc)) { | ||
| retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); | ||
|
|
@@ -1772,26 +1734,6 @@ bool SuperWord::profitable(const Node_List* p) const { | |
| // The second input has to be the vector we wanted to reduce, | ||
| // but it was not packed. | ||
| return false; | ||
| } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) { | ||
| // This heuristic predicts that the reduction is not profitable. | ||
| // Reduction vectors can be expensive, because they require multiple | ||
| // operations to fold all the lanes together. Hence, vectorizing the | ||
| // reduction is not profitable on its own. Hence, we need a lot of | ||
| // other "work vectors" that deliver performance improvements to | ||
| // balance out the performance loss due to reductions. | ||
| // This heuristic is a bit simplistic, and assumes that the reduction | ||
| // vector stays in the loop. But in some cases, we can move the | ||
| // reduction out of the loop, replacing it with a single vector op. | ||
| // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop | ||
| // Hence, this heuristic has room for improvement. | ||
| #ifndef PRODUCT | ||
| if (is_trace_superword_rejections()) { | ||
| tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make"); | ||
| tty->print_cr(" reduction profitable."); | ||
| tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); | ||
| } | ||
| #endif | ||
| return false; | ||
| } else if (second_pk->size() != p->size()) { | ||
| return false; | ||
| } | ||
|
|
@@ -1950,19 +1892,53 @@ bool SuperWord::do_vtransform() const { | |
| vtransform.optimize(); | ||
|
|
||
| if (!vtransform.schedule()) { return false; } | ||
| if (vtransform.has_store_to_load_forwarding_failure()) { return false; } | ||
|
|
||
| if (!vtransform.is_profitable()) { return false; } | ||
|
|
||
| vtransform.apply(); | ||
| return true; | ||
| } | ||
|
|
||
| // Check Cost-Model, and other heuristics. | ||
| // Can be overridden with AutoVectorizationOverrideProfitability. | ||
| bool VTransform::is_profitable() const { | ||
| assert(_graph.is_scheduled(), "must already be scheduled"); | ||
|
|
||
| if (AutoVectorizationOverrideProfitability == 0) { | ||
| #ifndef PRODUCT | ||
| if (is_trace_superword_any()) { | ||
| if (_trace._info) { | ||
| tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0)."); | ||
|
Comment on lines
+1909
to
1910
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Side note: consider filing a separate RFE to switch outputs like this to UL (Unified Logging).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Absolutely. The tricky part is that the current
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately no. I think this is what @anton-seoane worked on before.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I have taken up the task again, so sooner rather than later CompileCommand filtering for UL will be enabled for cases such as this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, that's what I thought. For now, I'll extend the tracing the way I've been doing, and once we have UL available with method-level filtering, then I can migrate it all in one single PR :) |
||
| } | ||
| #endif | ||
| return false; | ||
| } | ||
|
|
||
| vtransform.apply(); | ||
| return true; | ||
| if (AutoVectorizationOverrideProfitability == 2) { | ||
| #ifndef PRODUCT | ||
| if (_trace._info) { | ||
| tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2)."); | ||
| } | ||
| #endif | ||
| return true; | ||
| } | ||
|
|
||
| // Note: currently we only do throughput-based cost-modeling. In the future, we could | ||
| // also implement latency-based cost-modeling and take store-to-load-forwarding | ||
| // failures into account as the latency between the load and store. This would | ||
| // allow a more precise tradeoff between the forwarding failure penalty versus | ||
| // the vectorization gains. | ||
| if (has_store_to_load_forwarding_failure()) { return false; } | ||
|
|
||
| // Cost-model | ||
| float scalar_cost = _vloop_analyzer.cost_for_scalar_loop(); | ||
| float vector_cost = cost_for_vector_loop(); | ||
| #ifndef PRODUCT | ||
| if (_trace._info) { | ||
| tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", | ||
| scalar_cost, vector_cost); | ||
| } | ||
| #endif | ||
| return vector_cost < scalar_cost; | ||
| } | ||
|
|
||
| // Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { | |
| int pointers_idx = 0; | ||
| _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { | ||
| // Placement new: construct directly into the array. | ||
| ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); | ||
| ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes); | ||
| _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); | ||
| pointers_idx++; | ||
| }); | ||
|
|
@@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() { | |
| } | ||
| } | ||
|
|
||
| // Cost-model heuristic for nodes that do not contribute to computational | ||
| // cost inside the loop. | ||
| bool VLoopAnalyzer::has_zero_cost(Node* n) const { | ||
| // Outside body? | ||
| if (!_vloop.in_bb(n)) { return true; } | ||
|
|
||
| // Internal nodes of pointer expressions are most likely folded into | ||
| // the load / store and have no additional cost. | ||
| if (vpointers().is_in_pointer_expression(n)) { return true; } | ||
|
|
||
| // Not all AddP nodes can be detected in VPointer parsing, so | ||
| // we filter them out here. | ||
| // We don't want to explicitly model the cost of control flow, | ||
| // since we have the same CFG structure before and after | ||
| // vectorization: A loop head, a loop exit, with a backedge. | ||
| if (n->is_AddP() || // Pointer expression | ||
| n->is_CFG() || // CFG | ||
| n->is_Phi() || // CFG | ||
| n->is_Cmp() || // CFG | ||
| n->is_Bool()) { // CFG | ||
| return true; | ||
| } | ||
|
|
||
| // All other nodes have a non-zero cost. | ||
| return false; | ||
| } | ||
|
|
||
| // Compute the cost over all operations in the (scalar) loop. | ||
| float VLoopAnalyzer::cost_for_scalar_loop() const { | ||
| #ifndef PRODUCT | ||
| if (_vloop.is_trace_cost()) { | ||
| tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:"); | ||
| } | ||
| #endif | ||
|
|
||
| float sum = 0; | ||
| for (int j = 0; j < body().body().length(); j++) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is `body().body()`?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Maybe
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, would be nice if you move
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI, I filed: JDK-8371391 C2 SuperWord: rename body().body() to something more understandable |
||
| Node* n = body().body().at(j); | ||
| if (!has_zero_cost(n)) { | ||
| float c = cost_for_scalar_node(n->Opcode()); | ||
| sum += c; | ||
| #ifndef PRODUCT | ||
| if (_vloop.is_trace_cost_verbose()) { | ||
| tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); | ||
| } | ||
| #endif | ||
| } | ||
| } | ||
|
|
||
| #ifndef PRODUCT | ||
| if (_vloop.is_trace_cost()) { | ||
| tty->print_cr(" total_cost = %.2f", sum); | ||
| } | ||
| #endif | ||
| return sum; | ||
| } | ||
|
|
||
| // For now, we use unit cost. We might refine that in the future. | ||
| // If needed, we could also use platform specific costs, if the | ||
| // default here is not accurate enough. | ||
| float VLoopAnalyzer::cost_for_scalar_node(int opcode) const { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You need a `BasicType` parameter here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will add it :)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, I actually tried it right now, and it would take a bit of engineering at the call sites. In quite a few cases the BasicType is not immediately available. Is it ok if we ignore it for now, and only add it in once we really need it? |
||
| float c = 1; | ||
| #ifndef PRODUCT | ||
| if (_vloop.is_trace_cost()) { | ||
| tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); | ||
| } | ||
| #endif | ||
| return c; | ||
| } | ||
|
|
||
| // For now, we use unit cost. We might refine that in the future. | ||
| // If needed, we could also use platform specific costs, if the | ||
| // default here is not accurate enough. | ||
| float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const { | ||
| float c = 1; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same answer as above :) |
||
| #ifndef PRODUCT | ||
| if (_vloop.is_trace_cost()) { | ||
| tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", | ||
| c, NodeClassNames[opcode], vlen, type2name(bt)); | ||
| } | ||
| #endif | ||
| return c; | ||
| } | ||
|
|
||
| // For now, we use unit cost, i.e. we count the number of backend instructions | ||
| // that the vtnode will use. We might refine that in the future. | ||
| // If needed, we could also use platform specific costs, if the | ||
| // default here is not accurate enough. | ||
| float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { | ||
| // Each reduction is composed of multiple instructions, each estimated with a unit cost. | ||
| // Linear: shuffle and reduce Recursive: shuffle and reduce | ||
| float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we ask for the cost of the element-wise opcode here, something like
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be a little more precise, the strict one should be something like: and the non-strict one would be: Maybe refactoring a little bit to make the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @merykitty Can we do that in a follow-up RFE? For now, I'd like to keep it as simple as possible. Cost-models can become arbitrarily complex. There is a bit of a trade-off between simplicity and accuracy. And we can for sure improve things in the future, this PR just lays the foundation. My goal here is to start as simple as possible, and then add complexity if there is a proven need for it. So if you/we can find a benchmark where the cost model is not accurate enough yet, provable by Would that be acceptable for you?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What exactly does Personally, I don't want to get too stuck to counting instructions, but rather getting a throughput estimate. Counting instructions is an estimate for throughput, but I don't know yet if longterm it is the best. I would like to wait a little more, and start depending on the cost model for more and more cases (extract, pack, shuffle, if-conversion, ...) and then we will run into issues along the way where the cost model is not yet accurate enough. And at that point we can think again what would produce the most accurate results.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I believe it tries to estimate the number of instructions generated by a node.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm filing an RFE now
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. JDK-8371393 |
||
| #ifndef PRODUCT | ||
| if (_vloop.is_trace_cost()) { | ||
| tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", | ||
| c, NodeClassNames[opcode], vlen, type2name(bt), | ||
| requires_strict_order ? "true" : "false"); | ||
| } | ||
| #endif | ||
| return c; | ||
| } | ||
|
|
||
| // Computing aliasing runtime check using init and last of main-loop | ||
| // ----------------------------------------------------------------- | ||
| // | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note: no functional changes, only moving
`Op_MulVL` up to the other cases that work the same way, and improving some comments.