-
Notifications
You must be signed in to change notification settings - Fork 5.3k
/
gcm.cpp
2263 lines (2011 loc) · 79 KB
/
gcm.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/resourceArea.hpp"
#include "opto/block.hpp"
#include "opto/c2compiler.hpp"
#include "opto/callnode.hpp"
#include "opto/cfgnode.hpp"
#include "opto/machnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/phaseX.hpp"
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "opto/chaitin.hpp"
#include "runtime/deoptimization.hpp"
// Portions of code courtesy of Clifford Click
// Optimization - Graph Style
// Very small positive float constant, present to avoid float value underflow.
// NOTE(review): the code that applies it is not in this part of the file --
// presumably block frequencies are kept at or above this value; confirm at
// the use sites.
#define MIN_BLOCK_FREQUENCY 1.e-35f
//----------------------------schedule_node_into_block-------------------------
// Place node n in block b: record the node-to-block mapping and add n to b.
// Every projection hanging off n is then pulled into b as well.  After
// matching, nearly any node may have trailing projections (usually
// machine-dependent flags) that could have floated into another block below
// this one, so they are moved up to stay with n.
void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
  map_node_to_block(n, b);
  b->add_inst(n);

  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
    Node* out = n->fast_out(i);
    if (!out->is_Proj()) {
      continue;                        // Only projections have to follow n
    }
    Block* current = get_block_for_node(out);
    if (current == b) {
      continue;                        // Already in the right block
    }
    if (current != NULL) {
      current->find_remove(out);       // Take it out of the wrong block
    }
    map_node_to_block(out, b);
    b->add_inst(out);
  }
}
//----------------------------replace_block_proj_ctrl-------------------------
// A block projection lives in the predecessor block.  A node whose control
// input is such a projection therefore gets its control replaced by the head
// of the successor block that the projection leads to, so that the control
// input names the node's actual block.
// Nothing is changed when the control input is not a block projection, or
// when n is itself that block projection.
void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
  const Node *in0 = n->in(0);
  assert(in0 != NULL, "Only control-dependent");
  const Node *p = in0->is_block_proj();
  if (p == NULL || p == n) {
    return;                            // Control is not a block projection
  }
  assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");

  // The block projection already has a basic block assigned.
  Block *proj_block = get_block_for_node(in0);

  // Index of the successor reached through in0; trivially 0 for one successor.
  uint j = 0;
  if (proj_block->_num_succs != 1) {
    // Look for in0 among the last _num_succs nodes of the block; its offset
    // within that tail is used as the index into _succs.
    uint max = proj_block->number_of_nodes();
    assert( max > 1, "" );
    const uint first_proj = max - proj_block->_num_succs;
    j = first_proj;
    while (j < max && proj_block->get_node(j) != in0) {
      j++;
    }
    assert( j < max, "must find" );
    j -= first_proj;
  }

  // Make the control input match the head of the successor basic block.
  n->set_req(0, proj_block->_succs[j]->head());
}
// Return true if CFG node 'dom_node' dominates CFG node 'node'.  A node
// dominates itself.  Nodes in different blocks are compared through the
// dominator tree of their blocks.  Inside one block the order is decided by
// position: a block start comes first, a block projection comes last, and
// anything in between is ordered by walking the control inputs.
bool PhaseCFG::is_dominator(Node* dom_node, Node* node) {
  assert(is_CFG(node) && is_CFG(dom_node), "node and dom_node must be CFG nodes");
  if (dom_node == node) {
    return true;
  }

  Block* d = find_block_for_node(dom_node);
  Block* n = find_block_for_node(node);
  assert(n != NULL && d != NULL, "blocks must exist");

  if (d != n) {
    // Different blocks: 'd' dominates 'n' iff it is their common ancestor.
    return d->dom_lca(n) == d;
  }

  // Both nodes are in the same block.
  if (dom_node->is_block_start()) {
    return true;
  }
  if (node->is_block_start()) {
    return false;
  }
  if (dom_node->is_block_proj()) {
    return false;
  }
  if (node->is_block_proj()) {
    return true;
  }

  // Neither node is a block start or a block projection, so both sit in the
  // middle of the block.
  assert(is_control_proj_or_safepoint(node), "node must be control projection or safepoint");
  assert(is_control_proj_or_safepoint(dom_node), "dom_node must be control projection or safepoint");

  // 'dom_node' dominates 'node' iff it is found above 'node' in the control graph.
  const bool dom_above_node = is_dominating_control(dom_node, node);
#ifdef ASSERT
  // If 'dom_node' is not above 'node' then 'node' has to be above 'dom_node'.
  if (!dom_above_node && !is_dominating_control(node, dom_node)) {
    node->dump();
    dom_node->dump();
    assert(false, "neither dom_node nor node dominates the other");
  }
#endif
  return dom_above_node;
}
// A node counts as a CFG node here if it is a block projection, a block
// start, or a control projection / safepoint.
bool PhaseCFG::is_CFG(Node* n) {
  if (n->is_block_proj() || n->is_block_start()) {
    return true;
  }
  return is_control_proj_or_safepoint(n);
}
// Return true if n is a Mach node whose ideal opcode is SafePoint, or a
// projection whose bottom type is CONTROL.  In debug builds a control
// projection is additionally checked to be projection number 0.
bool PhaseCFG::is_control_proj_or_safepoint(Node* n) const {
  bool result = n->is_Mach() && n->as_Mach()->ideal_Opcode() == Op_SafePoint;
  if (!result) {
    result = n->is_Proj() && n->as_Proj()->bottom_type() == Type::CONTROL;
  }
  assert(!result || (n->is_Mach() && n->as_Mach()->ideal_Opcode() == Op_SafePoint)
         || (n->is_Proj() && n->as_Proj()->_con == 0), "If control projection, it must be projection 0");
  return result;
}
// Return the block of CFG node n.  Block starts and block projections are
// looked up directly.  Any other node must be a control projection or a
// safepoint sitting in the middle of a block; for those the control inputs
// are followed upwards until the block start is reached, and that node's
// block is returned.
Block* PhaseCFG::find_block_for_node(Node* n) const {
  if (n->is_block_start() || n->is_block_proj()) {
    return get_block_for_node(n);
  }

  assert(is_control_proj_or_safepoint(n), "must be control projection or safepoint");
  Node* head = n->in(0);
  for (; !head->is_block_start(); head = head->in(0)) {
    // keep climbing the control chain
  }
  return get_block_for_node(head);
}
// Follow the control inputs upwards from 'n' and report whether 'dom_ctrl' is
// encountered before a block start is reached.  The block start itself is
// not compared against 'dom_ctrl'.
bool PhaseCFG::is_dominating_control(Node* dom_ctrl, Node* n) {
  for (Node* ctrl = n->in(0); !ctrl->is_block_start(); ctrl = ctrl->in(0)) {
    if (ctrl == dom_ctrl) {
      return true;
    }
  }
  return false;
}
//------------------------------schedule_pinned_nodes--------------------------
// Set the basic block for Nodes pinned into blocks.
//
// Starting at _root, every node reachable through required inputs is visited
// once ('visited' records the node indices already seen).  For each node:
//  - if it is pinned and has no block yet, its block-projection control is
//    replaced and it is scheduled into the block of its controlling input;
//  - precedence edges that point at CFG nodes are removed and, if needed,
//    the node's control input is moved down to cover them;
//  - all non-NULL required inputs are pushed for later processing.
void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) {
  // Allocate node stack of size C->live_nodes()+8 to avoid frequent realloc
  GrowableArray <Node*> spstack(C->live_nodes() + 8);
  spstack.push(_root);

  while (spstack.is_nonempty()) {
    Node* node = spstack.pop();
    if (visited.test_set(node->_idx)) {  // Test node and flag it as visited
      continue;
    }

    if (node->pinned() && !has_block(node)) {  // Pinned?  Nail it down!
      assert(node->in(0), "pinned Node must have Control");
      // Before setting block replace block_proj control edge
      replace_block_proj_ctrl(node);
      Node* input = node->in(0);
      while (!input->is_block_start()) {
        input = input->in(0);
      }
      Block* block = get_block_for_node(input); // Basic block of controlling input
      schedule_node_into_block(node, block);
    }

    // If the node has precedence edges (added when CastPP nodes are
    // removed in final_graph_reshaping), fix the control of the
    // node to cover the precedence edges and remove the
    // dependencies.
    Node* n = NULL;  // Most dominated CFG node found among the precedence edges
    // Walk the precedence edges from the last slot down to req().  The
    // decrement lives in the loop condition so that the unsigned index can
    // never wrap around, even if req() or len() is 0.
    for (uint i = node->len(); i-- > node->req(); ) {
      Node* m = node->in(i);
      if (m == NULL) continue;

      // Only process precedence edges that are CFG nodes. Safepoints and control projections can be in the middle of a block
      if (is_CFG(m)) {
        node->rm_prec(i);
        if (n == NULL) {
          n = m;
        } else {
          assert(is_dominator(n, m) || is_dominator(m, n), "one must dominate the other");
          // Keep whichever of the two is dominated by the other.
          n = is_dominator(n, m) ? m : n;
        }
      } else {
        assert(node->is_Mach(), "sanity");
        assert(node->as_Mach()->ideal_Opcode() == Op_StoreCM, "must be StoreCM node");
      }
    }
    if (n != NULL) {
      assert(node->in(0), "control should have been set");
      assert(is_dominator(n, node->in(0)) || is_dominator(node->in(0), n), "one must dominate the other");
      // Only move the control when 'n' is below the current control input.
      if (!is_dominator(n, node->in(0))) {
        node->set_req(0, n);
      }
    }

    // process all inputs that are non NULL
    for (int i = node->req()-1; i >= 0; --i) {
      if (node->in(i) != NULL) {
        spstack.push(node->in(i));
      }
    }
  }
}
#ifdef ASSERT
// Debug-only check that the block b2 of a newly examined input is dominated
// by all previously examined inputs.  It is enough to verify that b2 is
// dominated by b1, the deepest input block observed before b2.  Nothing is
// checked when b1 is NULL.  On failure the blocks of all inputs of n and the
// node n itself are printed, and an assertion is raised.
static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {
  if (b1 == NULL) {
    return;
  }
  assert(b1->_dom_depth < b2->_dom_depth, "sanity");

  // Climb the dominator tree from b2 until b1 or the top is reached.
  Block* tmp = b2;
  for (; tmp != NULL && tmp != b1; tmp = tmp->_idom) {
    // nothing else to do per step
  }
  if (tmp == b1) {
    return;                            // b1 dominates b2, all is well
  }

  // Detected an unschedulable graph.  Print some nice stuff and die.
  tty->print_cr("!!! Unschedulable graph !!!");
  for (uint j = 0; j < n->len(); j++) {  // For all inputs
    Node* inn = n->in(j);
    if (inn != NULL) {                   // Skip NULL, missing inputs
      Block* inb = cfg->get_block_for_node(inn);
      tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
                 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
      inn->dump();
    }
  }
  tty->print("Failing node: ");
  n->dump();
  assert(false, "unscheduable graph");
}
#endif
// Return the block of the input of n that lies deepest in the dominator
// tree, i.e. the last input that is dominated by all the other inputs.
// All non-NULL inputs of n (including precedence edges, since the scan runs
// up to len()) must already have a block.
static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
  Block* deepb     = NULL;             // Deepest block seen so far
  int    max_depth = 0;                // Its depth in the dominator tree

  for (uint idx = 0; idx < n->len(); idx++) {
    Node* input = n->in(idx);
    if (input != NULL) {               // NULL means a missing input
      Block* inb = cfg->get_block_for_node(input);
      assert(inb != NULL, "must already have scheduled this input");
      if ((int) inb->_dom_depth > max_depth) {
        // The new block must be dominated by the previous deepest one.  The
        // inputs have to be linearly ordered in the dominator tree, or else
        // there would be no unique deepest block.
        DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));
        deepb     = inb;
        max_depth = inb->_dom_depth;
      }
    }
  }

  assert(deepb != NULL, "must be at least one input to n");
  return deepb;
}
//------------------------------schedule_early---------------------------------
// Find the earliest Block any instruction can be placed in. Some instructions
// are pinned into Blocks. Unpinned instructions can appear in last block in
// which all their inputs occur.
//
// The graph is walked depth-first over node inputs with an explicit stack
// ('nstack') instead of recursion. 'roots' is the worklist of nodes at which
// a walk is started; inputs that already have a block but were not visited
// yet are appended to it. 'visited' holds the indices of nodes already seen.
//
// Returns true on success. Returns false (after asserting in debug builds)
// when an input was already visited but still has no block, which means the
// graph cannot be scheduled.
bool PhaseCFG::schedule_early(VectorSet &visited, Node_Stack &roots) {
// Allocate stack with enough space to avoid frequent realloc
Node_Stack nstack(roots.size() + 8);
// _root will be processed among C->top() inputs
roots.push(C->top(), 0);
visited.set(C->top()->_idx);
while (roots.size() != 0) {
// Use local variables parent_node & input_index to hold the node being
// processed and the index of its next input, instead of keeping them
// on top of the stack.
Node* parent_node = roots.node();
uint input_index = 0;
roots.pop();
while (true) {
// input_index == 0 means this node is looked at for the first time.
if (input_index == 0) {
// Fixup some control. Constants without control get attached
// to root and nodes that use is_block_proj() nodes should be attached
// to the region that starts their block.
const Node* control_input = parent_node->in(0);
if (control_input != NULL) {
replace_block_proj_ctrl(parent_node);
} else {
// Is a constant with NO inputs?
if (parent_node->req() == 1) {
parent_node->set_req(0, _root);
}
}
}
// First, visit all inputs and force them to get a block. If an
// input is already in a block we quit following inputs (to avoid
// cycles). Instead we put that Node on a worklist to be handled
// later (since its inputs may not have a block yet).
// Assume all of parent_node's inputs will be processed
bool done = true;
while (input_index < parent_node->len()) {
Node* in = parent_node->in(input_index++);
if (in == NULL) {
continue;
}
// test_set() also marks 'in' as visited; is_visited is the previous state.
int is_visited = visited.test_set(in->_idx);
if (!has_block(in)) {
if (is_visited) {
// Seen before but still without a block: give up on this graph.
assert(false, "graph should be schedulable");
return false;
}
// Save parent node and next input's index.
nstack.push(parent_node, input_index);
// Process current input now.
parent_node = in;
input_index = 0;
// Not all of parent_node's inputs processed.
done = false;
break;
} else if (!is_visited) {
// Visit this guy later, using worklist
roots.push(in, 0);
}
}
if (done) {
// All of parent_node's inputs have been processed, complete post-processing.
// Some instructions are pinned into a block. These include Region,
// Phi, Start, Return, and other control-dependent instructions and
// any projections which depend on them.
if (!parent_node->pinned()) {
// Set earliest legal block.
Block* earliest_block = find_deepest_input(parent_node, this);
map_node_to_block(parent_node, earliest_block);
} else {
assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge");
}
if (nstack.is_empty()) {
// Finished all nodes on stack.
// Process next node on the worklist 'roots'.
break;
}
// Get saved parent node and next input's index.
parent_node = nstack.node();
input_index = nstack.index();
nstack.pop();
}
}
}
return true;
}
//------------------------------dom_lca----------------------------------------
// Find the least common ancestor, in the dominator tree, of this block and
// the block 'LCA'.  'LCA' is the current notion of the common ancestor, to be
// raised above 'this'.  As a convenient boundary condition 'this' is returned
// when 'LCA' is NULL (and also when 'LCA' already is 'this').
Block* Block::dom_lca(Block* LCA) {
  if (LCA == NULL || LCA == this) {
    return this;
  }

  Block* mine   = this;
  Block* theirs = LCA;

  // Bring whichever side is deeper up to the depth of the other one.
  while (mine->_dom_depth > theirs->_dom_depth) {
    mine = mine->_idom;
  }
  while (theirs->_dom_depth > mine->_dom_depth) {
    theirs = theirs->_idom;
  }

  // Now climb both sides in lock-step until they meet.
  while (theirs != mine) {
    theirs = theirs->_idom;
    mine   = mine->_idom;
  }
  return theirs;
}
//--------------------------raise_LCA_above_use--------------------------------
// While placing a definition 'def' we are handed one def->use edge.  The
// definition has to dominate the use, so the LCA is moved upward in the
// dominator tree until it dominates the use.
//  - A use without a block (e.g. an unused killing Proj) leaves LCA unchanged.
//  - For a non-Phi use the LCA is combined with the use's block.
//  - For a Phi use only the predecessor blocks of those Phi inputs that
//    actually are 'def' are taken into account.
static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {
  Block* use_block = cfg->get_block_for_node(use);
  if (use_block == NULL) {
    return LCA;
  }
  if (!use->is_Phi()) {
    return use_block->dom_lca(LCA);
  }

  // The loop deliberately does not stop at the first matching Phi input.
  // There are no true def-use chains, so from the def-use direction it is
  // impossible to tell which of several use-def edges between the same Phi
  // and the same def we were called for.  Each such edge belongs to a
  // different predecessor block, hence all of them are processed.  This costs
  // a little extra work when a Phi uses the same value more than once.
  const uint input_cnt = use->req();   // Number of Phi inputs
  for (uint idx = 1; idx < input_cnt; idx++) {
    if (use->in(idx) != def) {
      continue;                        // This path does not use def
    }
    Block* pred_block = cfg->get_block_for_node(use_block->pred(idx));
    LCA = pred_block->dom_lca(LCA);
  }
  return LCA;
}
//----------------------------raise_LCA_above_marks----------------------------
// Return a new LCA that dominates LCA and any of its marked predecessors.
// Search all my parents up to 'early' (exclusive), looking for predecessors
// which are marked with the given index. Return the LCA (in the dom tree)
// of all marked blocks. If there are none marked, return the original
// LCA.
//
// LCA   - starting block; also the running result that gets raised.
// mark  - value compared against each block's raise_LCA_mark() and
//         raise_LCA_visited() fields.
// early - upper bound of the search; the result is never raised above it.
// cfg   - used to map predecessor nodes to their blocks.
static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
// Blocks still to be examined.
Block_List worklist;
worklist.push(LCA);
while (worklist.size() > 0) {
Block* mid = worklist.pop();
if (mid == early) continue; // stop searching here
// Skip blocks whose visited field already carries this mark; the field
// itself is set at the bottom of the loop.
if (mid->raise_LCA_visited() == mark) continue; // already visited
// Don't process the current LCA, otherwise the search may terminate early
if (mid != LCA && mid->raise_LCA_mark() == mark) {
// Raise the LCA.
LCA = mid->dom_lca(LCA);
if (LCA == early) break; // stop searching everywhere
assert(early->dominates(LCA), "early is high enough");
// Resume searching at that point, skipping intermediate levels.
worklist.push(LCA);
// 'mid' became the new LCA and was just pushed again: leave it unvisited
// so that its predecessors are explored when it is popped.
if (LCA == mid)
continue; // Don't mark as visited to avoid early termination.
} else {
// Keep searching through this block's predecessors.
// Predecessor indices start at 1.
for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
Block* mid_parent = cfg->get_block_for_node(mid->pred(j));
worklist.push(mid_parent);
}
}
// Remember that 'mid' has been handled for this mark.
mid->set_raise_LCA_visited(mark);
}
return LCA;
}
//--------------------------memory_early_block--------------------------------
// A variation of find_deepest_input, the heart of schedule_early.
// Computes the "early" block of a load as if only its memory, address (base
// and index) and control inputs existed, ignoring any other data inputs.
//
// Since only a subset of the edges is considered, the resulting block is
// earlier (at a shallower dom_depth) than the true schedule_early point of
// the node.  It serves as a more permissive site for anti-dependency
// insertion, but only if subsume_loads is enabled.
//
// If the load has no inputs besides control, memory, base and index, the
// incoming 'early' block is returned unchanged.
static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {
  Node* base;
  Node* index;
  Node* store = load->in(MemNode::Memory);
  load->as_Mach()->memory_inputs(base, index);

  assert(base != NodeSentinel && index != NodeSentinel,
         "unexpected base/index inputs");

  // Collect the inputs of interest: at most base, index, memory and control.
  Node* mem_inputs[4];
  int mem_inputs_length = 0;
  if (base != NULL) {
    mem_inputs[mem_inputs_length++] = base;
  }
  if (index != NULL) {
    mem_inputs[mem_inputs_length++] = index;
  }
  if (store != NULL) {
    mem_inputs[mem_inputs_length++] = store;
  }

  // One is added in the comparison to account for the control input, which
  // may be NULL but always occupies a slot in the input array.
  if (mem_inputs_length + 1 >= (int) load->req()) {
    return early;                      // No additional inputs to ignore
  }

  // This "load" has more inputs than just the memory, base and index inputs.
  // For the purpose of checking anti-dependences we have to start from the
  // early block of the address portion of the instruction only, ignoring
  // other blocks that may have factored into the wider schedule_early
  // calculation.
  Node* ctrl = load->in(0);
  if (ctrl != NULL) {
    mem_inputs[mem_inputs_length++] = ctrl;
  }

  Block* deepb         = NULL;         // Deepest block seen so far
  int    deepest_depth = 0;
  for (int i = 0; i < mem_inputs_length; i++) {
    Block* inb = cfg->get_block_for_node(mem_inputs[i]);
    if ((int) inb->_dom_depth > deepest_depth) {
      // The new block must be dominated by the previous deepest one.  The
      // inputs have to be linearly ordered in the dominator tree, or else
      // there would be no unique deepest block.
      DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));
      deepb         = inb;
      deepest_depth = inb->_dom_depth;
    }
  }
  return deepb;
}
//--------------------------insert_anti_dependences---------------------------
// A load may need to witness memory that nearby stores can overwrite.
// For each nearby store, either insert an "anti-dependence" edge
// from the load to the store, or else move LCA upward to force the
// load to (eventually) be scheduled in a block above the store.
//
// Do not add edges to stores on distinct control-flow paths;
// only add edges to stores which might interfere.
//
// Return the (updated) LCA. There will not be any possibly interfering
// store between the load's "early block" and the updated LCA.
// Any stores in the updated LCA will have new precedence edges
// back to the load. The caller is expected to schedule the load
// in the LCA, in which case the precedence edges will make LCM
// preserve anti-dependences. The caller may also hoist the load
// above the LCA, if it is not the early block.
Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
assert(load->needs_anti_dependence_check(), "must be a load of some sort");
assert(LCA != NULL, "");
DEBUG_ONLY(Block* LCA_orig = LCA);
// Compute the alias index. Loads and stores with different alias indices
// do not need anti-dependence edges.
int load_alias_idx = C->get_alias_index(load->adr_type());
#ifdef ASSERT
assert(Compile::AliasIdxTop <= load_alias_idx && load_alias_idx < C->num_alias_types(), "Invalid alias index");
if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
(PrintOpto || VerifyAliases ||
(PrintMiscellaneous && (WizardMode || Verbose)))) {
// Load nodes should not consume all of memory.
// Reporting a bottom type indicates a bug in adlc.
// If some particular type of node validly consumes all of memory,
// sharpen the preceding "if" to exclude it, so we can catch bugs here.
tty->print_cr("*** Possible Anti-Dependence Bug: Load consumes all of memory.");
load->dump(2);
if (VerifyAliases) assert(load_alias_idx != Compile::AliasIdxBot, "");
}
#endif
if (!C->alias_type(load_alias_idx)->is_rewritable()) {
// It is impossible to spoil this load by putting stores before it,
// because we know that the stores will never update the value
// which 'load' must witness.
return LCA;
}
node_idx_t load_index = load->_idx;
// Note the earliest legal placement of 'load', as determined by
// by the unique point in the dom tree where all memory effects
// and other inputs are first available. (Computed by schedule_early.)
// For normal loads, 'early' is the shallowest place (dom graph wise)
// to look for anti-deps between this load and any store.
Block* early = get_block_for_node(load);
// If we are subsuming loads, compute an "early" block that only considers
// memory or address inputs. This block may be different than the
// schedule_early block in that it could be at an even shallower depth in the
// dominator tree, and allow for a broader discovery of anti-dependences.
if (C->subsume_loads()) {
early = memory_early_block(load, early, this);
}
ResourceArea *area = Thread::current()->resource_area();
Node_List worklist_mem(area); // prior memory state to store
Node_List worklist_store(area); // possible-def to explore
Node_List worklist_visited(area); // visited mergemem nodes
Node_List non_early_stores(area); // all relevant stores outside of early
bool must_raise_LCA = false;
// 'load' uses some memory state; look for users of the same state.
// Recurse through MergeMem nodes to the stores that use them.
// Each of these stores is a possible definition of memory
// that 'load' needs to use. We need to force 'load'
// to occur before each such store. When the store is in
// the same block as 'load', we insert an anti-dependence
// edge load->store.
// The relevant stores "nearby" the load consist of a tree rooted
// at initial_mem, with internal nodes of type MergeMem.
// Therefore, the branches visited by the worklist are of this form:
// initial_mem -> (MergeMem ->)* store
// The anti-dependence constraints apply only to the fringe of this tree.
Node* initial_mem = load->in(MemNode::Memory);
worklist_store.push(initial_mem);
worklist_visited.push(initial_mem);
worklist_mem.push(NULL);
while (worklist_store.size() > 0) {
// Examine a nearby store to see if it might interfere with our load.
Node* mem = worklist_mem.pop();
Node* store = worklist_store.pop();
uint op = store->Opcode();
// MergeMems do not directly have anti-deps.
// Treat them as internal nodes in a forward tree of memory states,
// the leaves of which are each a 'possible-def'.
if (store == initial_mem // root (exclusive) of tree we are searching
|| op == Op_MergeMem // internal node of tree we are searching
) {
mem = store; // It's not a possibly interfering store.
if (store == initial_mem)
initial_mem = NULL; // only process initial memory once
for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
store = mem->fast_out(i);
if (store->is_MergeMem()) {
// Be sure we don't get into combinatorial problems.
// (Allow phis to be repeated; they can merge two relevant states.)
uint j = worklist_visited.size();
for (; j > 0; j--) {
if (worklist_visited.at(j-1) == store) break;
}
if (j > 0) continue; // already on work list; do not repeat
worklist_visited.push(store);
}
worklist_mem.push(mem);
worklist_store.push(store);
}
continue;
}
if (op == Op_MachProj || op == Op_Catch) continue;
if (store->needs_anti_dependence_check()) continue; // not really a store
// Compute the alias index. Loads and stores with different alias
// indices do not need anti-dependence edges. Wide MemBar's are
// anti-dependent on everything (except immutable memories).
const TypePtr* adr_type = store->adr_type();
if (!C->can_alias(adr_type, load_alias_idx)) continue;
// Most slow-path runtime calls do NOT modify Java memory, but
// they can block and so write Raw memory.
if (store->is_Mach()) {
MachNode* mstore = store->as_Mach();
if (load_alias_idx != Compile::AliasIdxRaw) {
// Check for call into the runtime using the Java calling
// convention (and from there into a wrapper); it has no
// _method. Can't do this optimization for Native calls because
// they CAN write to Java memory.
if (mstore->ideal_Opcode() == Op_CallStaticJava) {
assert(mstore->is_MachSafePoint(), "");
MachSafePointNode* ms = (MachSafePointNode*) mstore;
assert(ms->is_MachCallJava(), "");
MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
if (mcj->_method == NULL) {
// These runtime calls do not write to Java visible memory
// (other than Raw) and so do not require anti-dependence edges.
continue;
}
}
// Same for SafePoints: they read/write Raw but only read otherwise.
// This is basically a workaround for SafePoints only defining control
// instead of control + memory.
if (mstore->ideal_Opcode() == Op_SafePoint)
continue;
} else {
// Some raw memory, such as the load of "top" at an allocation,
// can be control dependent on the previous safepoint. See
// comments in GraphKit::allocate_heap() about control input.
// Inserting an anti-dep between such a safepoint and a use
// creates a cycle, and will cause a subsequent failure in
// local scheduling. (BugId 4919904)
// (%%% How can a control input be a safepoint and not a projection??)
if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
continue;
}
}
// Identify a block that the current load must be above,
// or else observe that 'store' is all the way up in the
// earliest legal block for 'load'. In the latter case,
// immediately insert an anti-dependence edge.
Block* store_block = get_block_for_node(store);
assert(store_block != NULL, "unused killing projections skipped above");
if (store->is_Phi()) {
// Loop-phis need to raise load before input. (Other phis are treated
// as store below.)
//
// 'load' uses memory which is one (or more) of the Phi's inputs.
// It must be scheduled not before the Phi, but rather before
// each of the relevant Phi inputs.
//
// Instead of finding the LCA of all inputs to a Phi that match 'mem',
// we mark each corresponding predecessor block and do a combined
// hoisting operation later (raise_LCA_above_marks).
//
// Do not assert(store_block != early, "Phi merging memory after access")
// PhiNode may be at start of block 'early' with backedge to 'early'
DEBUG_ONLY(bool found_match = false);
for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
if (store->in(j) == mem) { // Found matching input?
DEBUG_ONLY(found_match = true);
Block* pred_block = get_block_for_node(store_block->pred(j));
if (pred_block != early) {
// If any predecessor of the Phi matches the load's "early block",
// we do not need a precedence edge between the Phi and 'load'
// since the load will be forced into a block preceding the Phi.
pred_block->set_raise_LCA_mark(load_index);
assert(!LCA_orig->dominates(pred_block) ||
early->dominates(pred_block), "early is high enough");
must_raise_LCA = true;
} else {
// anti-dependent upon PHI pinned below 'early', no edge needed
LCA = early; // but can not schedule below 'early'
}
}
}
assert(found_match, "no worklist bug");
} else if (store_block != early) {
// 'store' is between the current LCA and earliest possible block.
// Label its block, and decide later on how to raise the LCA
// to include the effect on LCA of this store.
// If this store's block gets chosen as the raised LCA, we
// will find him on the non_early_stores list and stick him
// with a precedence edge.
// (But, don't bother if LCA is already raised all the way.)
if (LCA != early) {
store_block->set_raise_LCA_mark(load_index);
must_raise_LCA = true;
non_early_stores.push(store);
}
} else {
// Found a possibly-interfering store in the load's 'early' block.
// This means 'load' cannot sink at all in the dominator tree.
// Add an anti-dep edge, and squeeze 'load' into the highest block.
assert(store != load->find_exact_control(load->in(0)), "dependence cycle found");
if (verify) {
#ifdef ASSERT
// We expect an anti-dependence edge from 'load' to 'store', except when
// implicit_null_check() has hoisted 'store' above its early block to
// perform an implicit null check, and 'load' is placed in the null
// block. In this case it is safe to ignore the anti-dependence, as the
// null block is only reached if 'store' tries to write to null.
Block* store_null_block = NULL;
Node* store_null_check = store->find_out_with(Op_MachNullCheck);
if (store_null_check != NULL) {
Node* if_true = store_null_check->find_out_with(Op_IfTrue);
assert(if_true != NULL, "null check without null projection");
Node* null_block_region = if_true->find_out_with(Op_Region);
assert(null_block_region != NULL, "null check without null region");
store_null_block = get_block_for_node(null_block_region);
}
#endif
assert(LCA == store_null_block || store->find_edge(load) != -1,
"missing precedence edge");
} else {
store->add_prec(load);
}
LCA = early;
// This turns off the process of gathering non_early_stores.
}
}
// (Worklist is now empty; all nearby stores have been visited.)
// Finished if 'load' must be scheduled in its 'early' block.
// If we found any stores there, they have already been given
// precedence edges.
if (LCA == early) return LCA;
// We get here only if there are no possibly-interfering stores
// in the load's 'early' block. Move LCA up above all predecessors
// which contain stores we have noted.
//
// The raised LCA block can be a home to such interfering stores,
// but its predecessors must not contain any such stores.
//
// The raised LCA will be a lower bound for placing the load,
// preventing the load from sinking past any block containing
// a store that may invalidate the memory state required by 'load'.
if (must_raise_LCA)
LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
if (LCA == early) return LCA;
// Insert anti-dependence edges from 'load' to each store
// in the non-early LCA block.
// Mine the non_early_stores list for such stores.
if (LCA->raise_LCA_mark() == load_index) {
while (non_early_stores.size() > 0) {
Node* store = non_early_stores.pop();
Block* store_block = get_block_for_node(store);
if (store_block == LCA) {
// add anti_dependence from store to load in its own block
assert(store != load->find_exact_control(load->in(0)), "dependence cycle found");
if (verify) {
assert(store->find_edge(load) != -1, "missing precedence edge");
} else {
store->add_prec(load);
}
} else {
assert(store_block->raise_LCA_mark() == load_index, "block was marked");
// Any other stores we found must be either inside the new LCA
// or else outside the original LCA. In the latter case, they
// did not interfere with any use of 'load'.
assert(LCA->dominates(store_block)
|| !LCA_orig->dominates(store_block), "no stray stores");
}
}
}
// Return the highest block containing stores; any stores
// within that block have been given anti-dependence edges.
return LCA;
}
// Iterator that walks the node graph backwards. It is seeded with a root
// node and hands out one node per call to next(); a NULL result means the
// walk is complete. The visited set, the stack and the CFG are supplied by
// the caller and are held by reference.
class Node_Backward_Iterator {
public:
  // Set up a walk that starts at 'root', using the caller-provided
  // 'visited' set and 'stack' as working storage and 'cfg' for block lookups.
  Node_Backward_Iterator(Node* root, VectorSet& visited, Node_Stack& stack, PhaseCFG& cfg);

  // Produce the next node of the walk, or NULL once the walk is finished.
  Node* next();

private:
  // Default construction is not available to users of the class.
  Node_Backward_Iterator();

  VectorSet&  _visited;  // Nodes the walk has already reached
  Node_Stack& _stack;    // Saved traversal states
  PhaseCFG&   _cfg;      // Used to find the block of a node
};
// Bind the iterator to the caller's working storage and prime it so that the
// first call to next() starts from 'root'.
Node_Backward_Iterator::Node_Backward_Iterator(Node* root, VectorSet& visited, Node_Stack& stack, PhaseCFG& cfg)
  : _visited(visited), _stack(stack), _cfg(cfg) {
  // No node has been visited yet.
  _visited.clear();

  // The stack must hold exactly one entry: the root, positioned past its
  // last out edge so that the out edges are scanned from the end.
  _stack.clear();
  _stack.push(root, root->outcnt());
}
// Advance the walk by one node.
//
// Returns the next node in a post-order visit (a node is handed out only
// after the uses this walk decides to follow have been handed out), or NULL
// when the stack is empty and the walk is finished.
//
// Each stack entry records a Def node, a 1st/2nd pass flag kept in the lowest
// bit of the node pointer, and the index of the current out edge.
Node* Node_Backward_Iterator::next() {
  // An empty stack means there is nothing left to visit.
  if (_stack.size() == 0) {
    return NULL;
  }

  // Unpack the saved state from the top of the stack.
  const uintptr_t tagged_node = (uintptr_t)_stack.node();
  Node* self = (Node*)(tagged_node & ~(uintptr_t)1);
  bool anti_dep_pass = (tagged_node & 1) != 0;
  // Clamp the saved edge index to the current out count; this supports
  // removal of nodes while the walk is in progress.
  uint idx = MIN2(_stack.index(), self->outcnt());
  _stack.pop();

  // Each trip through this loop either descends into a use of 'self',
  // switches 'self' to its second pass, or finishes with 'self'.
  for (;;) {
    _visited.set(self->_idx);

    // Projections are located through the node they project from.
    const Node* src = self->is_Proj() ? self->in(0) : self;
    const uint src_rpo = _cfg.get_block_for_node(src)->_rpo;

    // Scan the out edges from the back (iterating backwards supports removal
    // of nodes), looking for a use to descend into.
    Node* candidate = NULL;
    while (idx > 0) {
      Node* out = self->raw_out(--idx);

      // Children that were already visited are skipped.
      if (_visited.test(out->_idx)) {
        continue;
      }

      Node* user = out->is_Proj() ? out->in(0) : out;
      const uint user_rpo = _cfg.get_block_for_node(user)->_rpo;

      // Backward control edges are not traversed.
      const bool backward_edge = user_rpo < src_rpo;
      // Phi nodes always precede uses in a basic block.
      const bool phi_in_same_block = (user_rpo == src_rpo) && user->is_Phi();
      if (backward_edge || phi_in_same_block) {
        continue;
      }

      // Remember this use. The scan stops at once when the use belongs to
      // the current pass (1st pass: no anti-dependence check needed, 2nd
      // pass: anti-dependence check needed). A use of the other kind stays
      // remembered as a fallback: if the scan runs out without finding a use
      // of the wanted kind, the most recently remembered use is taken.
      candidate = out;
      if (out->needs_anti_dependence_check() == anti_dep_pass) {
        break;
      }
    }

    if (candidate != NULL) {
      // Descend: save the current state (pass flag folded into the low bit
      // of the node pointer) and restart the scan on the chosen use.
      _stack.push((Node*)((uintptr_t)self | (uintptr_t)anti_dep_pass), idx);
      self = candidate;
      anti_dep_pass = false;
      idx = self->outcnt();
      continue;
    }

    // Nothing left to descend into on this pass.
    if (anti_dep_pass) {
      break;  // Both passes are done; post-visit 'self'
    }

    // Start the 2nd pass over the out edges of the same node.
    anti_dep_pass = true;
    idx = self->outcnt();
  }

  return self;
}
//------------------------------ComputeLatenciesBackwards----------------------
// Walk the graph with a Node_Backward_Iterator rooted at _root and hand every
// node it yields to partial_latency_of_defs(). 'visited' and 'stack' are the
// working storage passed on to the iterator.
void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_Stack &stack) {
#ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n#---- ComputeLatenciesBackwards ----\n");
  }
#endif

  Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);

  // The iterator returns NULL once all nodes have been handed out.
  for (Node* n = iter.next(); n != NULL; n = iter.next()) {
    // Set the latency for the definitions of this instruction
    partial_latency_of_defs(n);
  }
} // end ComputeLatenciesBackwards
//------------------------------partial_latency_of_defs------------------------
// Compute the latency impact of this node on all defs. This computes
// a number that increases as we approach the beginning of the routine.
void PhaseCFG::partial_latency_of_defs(Node *n) {
// Set the latency for this instruction
#ifndef PRODUCT
if (trace_opto_pipelining()) {
tty->print("# latency_to_inputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
dump();
}
#endif
if (n->is_Proj()) {
n = n->in(0);
}
if (n->is_Root()) {
return;
}