diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index 06a8c2fecd5..20ee87a1829 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -671,76 +671,50 @@ void PhaseIdealLoop::do_peeling(IdealLoopTree *loop, Node_List &old_new) { loop->record_for_igvn(); } -// The Estimated Loop Unroll Size: UnrollFactor * (106% * BodySize + BC) + CC, -// where BC and CC are (totally) ad-hoc/magic "body" and "clone" constants, -// respectively, used to ensure that node usage estimates made are on the safe -// side, for the most part. This is a simplified version of the loop clone -// size calculation in est_loop_clone_sz(), defined for unroll factors larger -// than one (>1), performing an overflow check and returning 'UINT_MAX' in -// case of an overflow. -static uint est_loop_unroll_sz(uint factor, uint size) { - precond(0 < factor); - - uint const bc = 5; - uint const cc = 7; - uint const sz = size + (size + 15) / 16; - uint estimate = factor * (sz + bc) + cc; - - return (estimate - cc) / factor == sz + bc ? estimate : UINT_MAX; -} - -#define EMPTY_LOOP_SIZE 7 // Number of nodes in an empty loop. - //------------------------------policy_maximally_unroll------------------------ // Calculate the exact loop trip-count and return TRUE if loop can be fully, // i.e. maximally, unrolled, otherwise return FALSE. When TRUE, the estimated // node budget is also requested. -bool IdealLoopTree::policy_maximally_unroll(PhaseIdealLoop *phase) const { - CountedLoopNode *cl = _head->as_CountedLoop(); +bool IdealLoopTree::policy_maximally_unroll(PhaseIdealLoop* phase) const { + CountedLoopNode* cl = _head->as_CountedLoop(); assert(cl->is_normal_loop(), ""); if (!cl->is_valid_counted_loop()) { - return false; // Malformed counted loop + return false; // Malformed counted loop. } if (!cl->has_exact_trip_count()) { - // Trip count is not exact. - return false; + return false; // Trip count is not exact. } uint trip_count = cl->trip_count(); // Note, max_juint is used to indicate unknown trip count. assert(trip_count > 1, "one iteration loop should be optimized out already"); - assert(trip_count < max_juint, "exact trip_count should be less than max_uint."); + assert(trip_count < max_juint, "exact trip_count should be less than max_juint."); // If nodes are depleted, some transform has miscalculated its needs. assert(!phase->exceeding_node_budget(), "sanity"); - // Real policy: if we maximally unroll, does it get too big? - // Allow the unrolled mess to get larger than standard loop - // size. After all, it will no longer be a loop. - uint body_size = _body.size(); + // Allow the unrolled body to get larger than the standard loop size limit. uint unroll_limit = (uint)LoopUnrollLimit * 4; assert((intx)unroll_limit == LoopUnrollLimit * 4, "LoopUnrollLimit must fit in 32bits"); - if (trip_count > unroll_limit || body_size > unroll_limit) { + if (trip_count > unroll_limit || _body.size() > unroll_limit) { return false; } - // Take into account that after unroll conjoined heads and tails will fold, - // otherwise policy_unroll() may allow more unrolling than max unrolling. - uint new_body_size = est_loop_unroll_sz(trip_count, body_size - EMPTY_LOOP_SIZE); + uint new_body_size = est_loop_unroll_sz(trip_count); if (new_body_size == UINT_MAX) { // Check for bad estimate (overflow). return false; } - // Fully unroll a loop with few iterations regardless next conditions since - // following loop optimizations will split such loop anyway (pre-main-post). + // Fully unroll a loop with few iterations, regardless of other conditions, + // since the following (general) loop optimizations will split such loop in + // any case (into pre-main-post). if (trip_count <= 3) { return phase->may_require_nodes(new_body_size); } - if (new_body_size > unroll_limit || - // Unrolling can result in a large amount of node construction - phase->exceeding_node_budget(new_body_size)) { + // Reject if unrolling will result in too much node construction. + if (new_body_size > unroll_limit || phase->exceeding_node_budget(new_body_size)) { return false; } diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index a2ffc03935c..1a3970e142c 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -2471,6 +2471,39 @@ uint IdealLoopTree::est_loop_clone_sz(uint factor) const { assert((estimate - cc) / factor == sz + bc, "overflow"); + return estimate + est_loop_flow_merge_sz(); +} + +// The Estimated Loop (full-) Unroll Size: +// UnrollFactor * (~106% * BodySize) + CC + FanOutTerm, +// where CC is a (totally) ad-hoc/magic "clone" constant, used to ensure that +// node usage estimates made are on the safe side, for the most part. This is +// a "light" version of the loop clone size calculation (above), based on the +// assumption that most of the loop-construct overhead will be unraveled when +// (fully) unrolled. Defined for unroll factors larger or equal to one (>=1), +// including an overflow check and returning UINT_MAX in case of an overflow. +uint IdealLoopTree::est_loop_unroll_sz(uint factor) const { + + precond(factor > 0); + + // Take into account that after unroll conjoined heads and tails will fold. + uint const b0 = _body.size() - EMPTY_LOOP_SIZE; + uint const cc = 7; + uint const sz = b0 + (b0 + 15) / 16; + uint estimate = factor * sz + cc; + + if ((estimate - cc) / factor != sz) { + return UINT_MAX; + } + + return estimate + est_loop_flow_merge_sz(); +} + +// Estimate the growth effect (in nodes) of merging control and data flow when +// cloning a loop body, based on the amount of control and data flow reaching +// outside of the (current) loop body. +uint IdealLoopTree::est_loop_flow_merge_sz() const { + uint ctrl_edge_out_cnt = 0; uint data_edge_out_cnt = 0; @@ -2494,24 +2527,21 @@ uint IdealLoopTree::est_loop_clone_sz(uint factor) const { } } } - // Add data and control count (x2.0) to estimate iff both are > 0. This is + // Use data and control count (x2.0) in estimate iff both are > 0. This is // a rather pessimistic estimate for the most part, in particular for some // complex loops, but still not enough to capture all loops. if (ctrl_edge_out_cnt > 0 && data_edge_out_cnt > 0) { - estimate += 2 * (ctrl_edge_out_cnt + data_edge_out_cnt); + return 2 * (ctrl_edge_out_cnt + data_edge_out_cnt); } - - return estimate; + return 0; } #ifndef PRODUCT //------------------------------dump_head-------------------------------------- // Dump 1 liner for loop header info void IdealLoopTree::dump_head() const { - for (uint i = 0; i < _nest; i++) { - tty->print(" "); - } - tty->print("Loop: N%d/N%d ",_head->_idx,_tail->_idx); + tty->sp(2 * _nest); + tty->print("Loop: N%d/N%d ", _head->_idx, _tail->_idx); if (_irreducible) tty->print(" IRREDUCIBLE"); Node* entry = _head->is_Loop() ? _head->as_Loop()->skip_strip_mined(-1)->in(LoopNode::EntryControl) : _head->in(LoopNode::EntryControl); Node* predicate = PhaseIdealLoop::find_predicate_insertion_point(entry, Deoptimization::Reason_loop_limit_check); @@ -4501,69 +4531,67 @@ void PhaseIdealLoop::dump_bad_graph(const char* msg, Node* n, Node* early, Node* #ifndef PRODUCT //------------------------------dump------------------------------------------- -void PhaseIdealLoop::dump( ) const { +void PhaseIdealLoop::dump() const { ResourceMark rm; Arena* arena = Thread::current()->resource_area(); Node_Stack stack(arena, C->live_nodes() >> 2); Node_List rpo_list; VectorSet visited(arena); visited.set(C->top()->_idx); - rpo( C->root(), stack, visited, rpo_list ); + rpo(C->root(), stack, visited, rpo_list); // Dump root loop indexed by last element in PO order - dump( _ltree_root, rpo_list.size(), rpo_list ); + dump(_ltree_root, rpo_list.size(), rpo_list); } -void PhaseIdealLoop::dump( IdealLoopTree *loop, uint idx, Node_List &rpo_list ) const { +void PhaseIdealLoop::dump(IdealLoopTree* loop, uint idx, Node_List &rpo_list) const { loop->dump_head(); // Now scan for CFG nodes in the same loop - for( uint j=idx; j > 0; j-- ) { - Node *n = rpo_list[j-1]; - if( !_nodes[n->_idx] ) // Skip dead nodes + for (uint j = idx; j > 0; j--) { + Node* n = rpo_list[j-1]; + if (!_nodes[n->_idx]) // Skip dead nodes continue; - if( get_loop(n) != loop ) { // Wrong loop nest - if( get_loop(n)->_head == n && // Found nested loop? - get_loop(n)->_parent == loop ) - dump(get_loop(n),rpo_list.size(),rpo_list); // Print it nested-ly + + if (get_loop(n) != loop) { // Wrong loop nest + if (get_loop(n)->_head == n && // Found nested loop? + get_loop(n)->_parent == loop) + dump(get_loop(n), rpo_list.size(), rpo_list); // Print it nested-ly continue; } // Dump controlling node - for( uint x = 0; x < loop->_nest; x++ ) - tty->print(" "); + tty->sp(2 * loop->_nest); tty->print("C"); - if( n == C->root() ) { + if (n == C->root()) { n->dump(); } else { Node* cached_idom = idom_no_update(n); - Node *computed_idom = n->in(0); - if( n->is_Region() ) { + Node* computed_idom = n->in(0); + if (n->is_Region()) { computed_idom = compute_idom(n); // computed_idom() will return n->in(0) when idom(n) is an IfNode (or // any MultiBranch ctrl node), so apply a similar transform to // the cached idom returned from idom_no_update. cached_idom = find_non_split_ctrl(cached_idom); } - tty->print(" ID:%d",computed_idom->_idx); + tty->print(" ID:%d", computed_idom->_idx); n->dump(); - if( cached_idom != computed_idom ) { + if (cached_idom != computed_idom) { tty->print_cr("*** BROKEN IDOM! Computed as: %d, cached as: %d", computed_idom->_idx, cached_idom->_idx); } } // Dump nodes it controls - for( uint k = 0; k < _nodes.Size(); k++ ) { + for (uint k = 0; k < _nodes.Size(); k++) { // (k < C->unique() && get_ctrl(find(k)) == n) if (k < C->unique() && _nodes[k] == (Node*)((intptr_t)n + 1)) { - Node *m = C->root()->find(k); - if( m && m->outcnt() > 0 ) { + Node* m = C->root()->find(k); + if (m && m->outcnt() > 0) { if (!(has_ctrl(m) && get_ctrl_no_update(m) == n)) { tty->print_cr("*** BROKEN CTRL ACCESSOR! _nodes[k] is %p, ctrl is %p", _nodes[k], has_ctrl(m) ? get_ctrl_no_update(m) : NULL); } - for( uint j = 0; j < loop->_nest; j++ ) - tty->print(" "); - tty->print(" "); + tty->sp(2 * loop->_nest + 1); m->dump(); } } @@ -4574,7 +4602,7 @@ void PhaseIdealLoop::dump( IdealLoopTree *loop, uint idx, Node_List &rpo_list ) // Collect a R-P-O for the whole CFG. // Result list is in post-order (scan backwards for RPO) -void PhaseIdealLoop::rpo( Node *start, Node_Stack &stk, VectorSet &visited, Node_List &rpo_list ) const { +void PhaseIdealLoop::rpo(Node* start, Node_Stack &stk, VectorSet &visited, Node_List &rpo_list) const { stk.push(start, 0); visited.set(start->_idx); @@ -4596,7 +4624,7 @@ void PhaseIdealLoop::rpo( Node *start, Node_Stack &stk, VectorSet &visited, Node //============================================================================= -//------------------------------LoopTreeIterator----------------------------------- +//------------------------------LoopTreeIterator------------------------------- // Advance to next loop tree using a preorder, left-to-right traversal. void LoopTreeIterator::next() { diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index 80b7d3f4791..e12792212fd 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -623,6 +623,8 @@ class IdealLoopTree : public ResourceObj { // Estimate the number of nodes required when cloning a loop (body). uint est_loop_clone_sz(uint factor) const; + // Estimate the number of nodes required when unrolling a loop (body). + uint est_loop_unroll_sz(uint factor) const; // Compute loop trip count if possible void compute_trip_count(PhaseIdealLoop* phase); @@ -654,11 +656,16 @@ class IdealLoopTree : public ResourceObj { void remove_main_post_loops(CountedLoopNode *cl, PhaseIdealLoop *phase); #ifndef PRODUCT - void dump_head( ) const; // Dump loop head only + void dump_head() const; // Dump loop head only void dump() const; // Dump this loop recursively void verify_tree(IdealLoopTree *loop, const IdealLoopTree *parent) const; #endif + private: + enum { EMPTY_LOOP_SIZE = 7 }; // Number of nodes in an empty loop. + + // Estimate the number of nodes resulting from control and data flow merge. + uint est_loop_flow_merge_sz() const; }; // -----------------------------PhaseIdealLoop--------------------------------- @@ -675,7 +682,7 @@ class PhaseIdealLoop : public PhaseTransform { PhaseIterGVN &_igvn; // Head of loop tree - IdealLoopTree *_ltree_root; + IdealLoopTree* _ltree_root; // Array of pre-order numbers, plus post-visited bit. // ZERO for not pre-visited. EVEN for pre-visited but not post-visited. @@ -1017,9 +1024,9 @@ class PhaseIdealLoop : public PhaseTransform { bool _has_irreducible_loops; // Per-Node transform - virtual Node *transform( Node *a_node ) { return 0; } + virtual Node* transform(Node* n) { return 0; } - bool is_counted_loop(Node* x, IdealLoopTree*& loop); + bool is_counted_loop(Node* n, IdealLoopTree* &loop); IdealLoopTree* create_outer_strip_mined_loop(BoolNode *test, Node *cmp, Node *init_control, IdealLoopTree* loop, float cl_prob, float le_fcnt, Node*& entry_control, Node*& iffalse); @@ -1034,7 +1041,7 @@ class PhaseIdealLoop : public PhaseTransform { return (IdealLoopTree*)_nodes[n->_idx]; } - IdealLoopTree *ltree_root() const { return _ltree_root; } + IdealLoopTree* ltree_root() const { return _ltree_root; } // Is 'n' a (nested) member of 'loop'? int is_member( const IdealLoopTree *loop, Node *n ) const { @@ -1319,7 +1326,7 @@ class PhaseIdealLoop : public PhaseTransform { // same block. Split thru the Region. void do_split_if( Node *iff ); - // Conversion of fill/copy patterns into intrisic versions + // Conversion of fill/copy patterns into intrinsic versions bool do_intrinsify_fill(); bool intrinsify_fill(IdealLoopTree* lpt); bool match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value, @@ -1419,18 +1426,18 @@ class PhaseIdealLoop : public PhaseTransform { public: void set_created_loop_node() { _created_loop_node = true; } bool created_loop_node() { return _created_loop_node; } - void register_new_node( Node *n, Node *blk ); + void register_new_node(Node* n, Node* blk); #ifdef ASSERT void dump_bad_graph(const char* msg, Node* n, Node* early, Node* LCA); #endif #ifndef PRODUCT - void dump( ) const; - void dump( IdealLoopTree *loop, uint rpo_idx, Node_List &rpo_list ) const; + void dump() const; + void dump(IdealLoopTree* loop, uint rpo_idx, Node_List &rpo_list) const; void verify() const; // Major slow :-) - void verify_compare( Node *n, const PhaseIdealLoop *loop_verify, VectorSet &visited ) const; - IdealLoopTree *get_loop_idx(Node* n) const { + void verify_compare(Node* n, const PhaseIdealLoop* loop_verify, VectorSet &visited) const; + IdealLoopTree* get_loop_idx(Node* n) const { // Dead nodes have no loop, so return the top level loop instead return _nodes[n->_idx] ? (IdealLoopTree*)_nodes[n->_idx] : _ltree_root; } @@ -1439,7 +1446,8 @@ class PhaseIdealLoop : public PhaseTransform { static int _loop_invokes; // Count of PhaseIdealLoop invokes static int _loop_work; // Sum of PhaseIdealLoop x _unique #endif - void rpo( Node *start, Node_Stack &stk, VectorSet &visited, Node_List &rpo_list ) const; + + void rpo(Node* start, Node_Stack &stk, VectorSet &visited, Node_List &rpo_list) const; }; diff --git a/test/hotspot/jtreg/compiler/loopopts/LoopUnrollBadNodeBudget.java b/test/hotspot/jtreg/compiler/loopopts/LoopUnrollBadNodeBudget.java new file mode 100644 index 00000000000..77a8cc0237f --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/LoopUnrollBadNodeBudget.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8229499 + * @summary Node estimate for loop unrolling is not correct/sufficient: + * assert(delta <= 2 * required) failed: Bad node estimate ... + * + * @requires !vm.graal.enabled + * + * @run main/othervm -XX:-TieredCompilation -XX:-BackgroundCompilation + * LoopUnrollBadNodeBudget + * + */ + +public class LoopUnrollBadNodeBudget { + + int a; + long b; + int c; + int d(long e, short f, int g) { + int h, j = 2, k, l[][] = new int[a][]; + for (h = 8; h < 58; ++h) + for (k = 1; 7 > k; ++k) + switch (h % 9 * 5 + 43) { + case 70: + case 65: + case 86: + case 81: + case 62: + case 69: + case 74: + g = j; + } + long m = u(l); + return (int)m; + } + void n(int p, int o) { d(b, (short)0, p); } + void r(String[] q) { + int i = 4; + n(i, c); + } + long u(int[][] a) { + long sum = 0; + return sum; + } + public static void main(String[] t) { + try { + LoopUnrollBadNodeBudget s = new LoopUnrollBadNodeBudget(); + for (int i = 5000; i > 0; i--) + s.r(t); + } catch (Exception ex) { + } + } +}