-
Notifications
You must be signed in to change notification settings - Fork 5.3k
/
superword.cpp
4861 lines (4457 loc) · 169 KB
/
superword.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 2007, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
#include "precompiled.hpp"
#include "compiler/compileLog.hpp"
#include "libadt/vectset.hpp"
#include "memory/allocation.inline.hpp"
#include "memory/resourceArea.hpp"
#include "opto/addnode.hpp"
#include "opto/callnode.hpp"
#include "opto/castnode.hpp"
#include "opto/convertnode.hpp"
#include "opto/divnode.hpp"
#include "opto/matcher.hpp"
#include "opto/memnode.hpp"
#include "opto/mulnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/opaquenode.hpp"
#include "opto/superword.hpp"
#include "opto/vectornode.hpp"
#include "opto/movenode.hpp"
#include "utilities/powerOfTwo.hpp"
//
// S U P E R W O R D T R A N S F O R M
//=============================================================================
//------------------------------SuperWord---------------------------
// Creates a SuperWord object tied to the given loop-optimization phase.
// Every container is created against arena(); the per-loop fields (_lpt, _lp,
// _bb, _iv, ...) start out as NULL and are filled in later, e.g. by
// transform_loop().
SuperWord::SuperWord(PhaseIdealLoop* phase) :
  _phase(phase),
  _arena(phase->C->comp_arena()),
  _igvn(phase->_igvn),
  _packset(arena(), 8, 0, NULL),                          // packs belonging to the block being processed
  _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // maps a node idx to its position in the block
  _block(arena(), 8, 0, NULL),                            // nodes of the block being processed
  _post_block(arena(), 8, 0, NULL),                       // block nodes marked as post-loop vectorizable
  _data_entry(arena(), 8, 0, NULL),                       // nodes whose inputs all come from outside
  _mem_slice_head(arena(), 8, 0, NULL),                   // heads of the memory slices
  _mem_slice_tail(arena(), 8, 0, NULL),                   // tails of the memory slices
  _node_info(arena(), 8, 0, SWNodeInfo::initial),         // per-node bookkeeping
  _clone_map(phase->C->clone_map()),                      // map of nodes produced by cloning
  _cmovev_kit(_arena, this),                              // helper map used when creating CMoveV
  _align_to_ref(NULL),                                    // memory reference the vectors are aligned to
  _disjoint_ptrs(arena(), 8, 0, OrderedPair::initial),    // pointer pairs disambiguated at runtime
  _dg(_arena),                                            // dependence graph
  _visited(arena()),                                      // set of visited nodes
  _post_visited(arena()),                                 // set of post-visited nodes
  _n_idx_list(arena(), 8),                                // scratch (node, index) pair list
  _nlist(arena(), 8, 0, NULL),                            // scratch node list
  _stk(arena(), 8, 0, NULL),                              // scratch node stack
  _lpt(NULL),                                             // loop tree node
  _lp(NULL),                                              // CountedLoopNode
  _pre_loop_end(NULL),                                    // CountedLoopEndNode of the pre loop
  _bb(NULL),                                              // basic block
  _iv(NULL),                                              // induction variable
  _race_possible(false),                                  // cases where SDMU is true
  _early_return(true),                                    // analysis evaluations routine
  _do_vector_loop(phase->C->do_vector_loop()),            // vectorization/simd style requested?
  _do_reserve_copy(DoReserveCopyInSuperWord),
  _num_work_vecs(0),                                      // amount of vector work found
  _num_reductions(0),                                     // amount of reduction work found
  _ii_first(-1),                                          // first loop generation index (do_vector_loop() only)
  _ii_last(-1),                                           // last loop generation index (do_vector_loop() only)
  _ii_order(arena(), 8, 0, 0)
{
#ifndef PRODUCT
  // The debug level is taken from the compile directive, but only when a
  // method is attached to the compilation; otherwise it stays at zero.
  _vector_loop_debug = (_phase->C->method() != NULL)
                         ? phase->C->directive()->VectorizeDebugOption
                         : 0;
#endif
}
// File-local compile-time switch for the experimental vectorization path that
// relies on data from loop unrolling. It is set to false here, so the code in
// this file that is guarded by it is not executed.
static const bool _do_vector_loop_experimental = false; // Experimental vectorization which uses data from loop unrolling.
//------------------------------transform_loop---------------------------
// Entry point of the SuperWord transformation for one counted loop.
//
// lpt             - loop tree node; its head must be a CountedLoopNode.
// do_optimization - when true, SLP_extract() is run on the loop and, on
//                   targets with predicated vectors and PostLoopMultiversioning,
//                   the main loop's slp_max_unroll is forwarded to the following
//                   post loop. When false, only the per-loop state
//                   (lpt, lp, bb) is set up.
//
// The function returns without doing anything when the platform vector width
// is unusable, the loop shape is not supported (malformed counted loop, extra
// control flow, missing or unexpected pre-loop) or the loop was already
// handled (vectorized / unroll-only).
void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
  assert(UseSuperWord, "should be");
  // SuperWord only works with power of two vector sizes.
  int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
  if (vector_width < 2 || !is_power_of_2(vector_width)) {
    return;
  }

  assert(lpt->_head->is_CountedLoop(), "must be");
  CountedLoopNode *cl = lpt->_head->as_CountedLoop();

  if (!cl->is_valid_counted_loop(T_INT)) return; // skip malformed counted loop

  bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
  if (post_loop_allowed) {
    if (cl->is_reduction_loop()) return; // no predication mapping
    Node *limit = cl->limit();
    if (limit->is_Con()) return; // non constant limits only
    // Now check the limit for expressions we do not handle
    if (limit->is_Add()) {
      Node *in2 = limit->in(2);
      if (in2->is_Con()) {
        int val = in2->get_int();
        // should not try to program these cases
        if (val < 0) return;
      }
    }
  }

  // skip any loop that has not been assigned max unroll by analysis
  if (do_optimization) {
    if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) return;
  }

  // Check for no control flow in body (other than exit)
  Node *cl_exit = cl->loopexit();
  if (cl->is_main_loop() && (cl_exit->in(0) != lpt->_head)) {
#ifndef PRODUCT
    if (TraceSuperWord) {
      tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
      tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
      tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
      tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
      lpt->dump_head();
    }
#endif
    return;
  }

  // Make sure there are no extra control users of the loop backedge
  if (cl->back_control()->outcnt() != 1) {
    return;
  }

  // Skip any loops already optimized by slp
  if (cl->is_vectorized_loop()) return;

  if (cl->is_unroll_only()) return;

  if (cl->is_main_loop()) {
    // Check for pre-loop ending with CountedLoopEnd(Bool(Cmp(x,Opaque1(limit))))
    CountedLoopEndNode* pre_end = find_pre_loop_end(cl);
    if (pre_end == NULL) {
      return;
    }
    Node* pre_opaq1 = pre_end->limit();
    if (pre_opaq1->Opcode() != Op_Opaque1) {
      return;
    }
    set_pre_loop_end(pre_end);
  }

  init(); // initialize data structures

  set_lpt(lpt);
  set_lp(cl);

  // For now, define one block which is the entire loop body
  set_bb(cl);

  if (do_optimization) {
    assert(_packset.length() == 0, "packset must be empty");
    SLP_extract();
    if (PostLoopMultiversioning && Matcher::has_predicated_vectors()) {
      if (cl->is_vectorized_loop() && cl->is_main_loop() && !cl->is_reduction_loop()) {
        IdealLoopTree *lpt_next = lpt->_next;
        // The main loop is not guaranteed to have a next sibling in the loop
        // tree, and a sibling is not guaranteed to be a counted loop. Without
        // this guard the code below would dereference NULL or trip the
        // as_CountedLoop() check. In either case there is no post loop to
        // forward the unroll factor to, so simply skip the hand-over.
        if (lpt_next != NULL && lpt_next->_head->is_CountedLoop()) {
          CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop();
          _phase->has_range_checks(lpt_next);
          if (cl_next->is_post_loop() && !cl_next->range_checks_present()) {
            if (!cl_next->is_vectorized_loop()) {
              // Let the post loop be analysed with the unroll factor the
              // main loop was vectorized with.
              int slp_max_unroll_factor = cl->slp_max_unroll();
              cl_next->set_slp_max_unroll(slp_max_unroll_factor);
            }
          }
        }
      }
    }
  }
}
//------------------------------early unrolling analysis------------------------------
// Early analysis of the loop in lpt(): decides whether the loop body looks
// vectorizable and, if so, which vector size the unroll factor should map to.
//
// local_loop_unroll_factor - in/out. On entry it is compared against the
//   maximum vector size of every candidate node; on exit it is overwritten
//   with the chosen maximum vector size when the analysis succeeds and is
//   left unchanged otherwise.
//
// Side effects (only when the pre-scan of the body does not bail out):
//   - the loop head is marked was_slp, and passed_slp on success;
//   - the loop head may be marked as a subword loop;
//   - slp_max_unroll is set on main loops, and on post loops of predicated
//     targets unless a small basic type (or T_LONG) was seen;
//   - for such post loops, accepted nodes are appended to _post_block.
void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) {
bool is_slp = true;
ResourceMark rm;
size_t ignored_size = lpt()->_body.size();
// ignored_loop_nodes[i] holds the _idx of body node i when that node is to be
// skipped by the vector-size scan below, and -1 otherwise.
int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
Node_Stack nstack((int)ignored_size);
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
Node *cl_exit = cl->loopexit_or_null();
int rpo_idx = _post_block.length();
assert(rpo_idx == 0, "post loop block is empty");
// First clear the entries
for (uint i = 0; i < lpt()->_body.size(); i++) {
ignored_loop_nodes[i] = -1;
}
int max_vector = Matcher::max_vector_size(T_BYTE);
bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
// Process the loop, some/all of the stack entries will not be in order, ergo
// need to preprocess the ignored initial state before we process the loop
for (uint i = 0; i < lpt()->_body.size(); i++) {
Node* n = lpt()->_body.at(i);
// Loop bookkeeping nodes (increment, reductions, address arithmetic,
// compares, control) never take part in the vector-size decision.
if (n == cl->incr() ||
n->is_reduction() ||
n->is_AddP() ||
n->is_Cmp() ||
n->is_IfTrue() ||
n->is_CountedLoop() ||
(n == cl_exit)) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
if (n->is_If()) {
IfNode *iff = n->as_If();
if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
if (lpt()->is_loop_exit(iff)) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
}
}
// A memory Phi whose backedge input differs from its entry input must be
// fed by a memory node; otherwise the loop is rejected.
if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
Node* n_tail = n->in(LoopNode::LoopBackControl);
if (n_tail != n->in(LoopNode::EntryControl)) {
if (!n_tail->is_Mem()) {
is_slp = false;
break;
}
}
}
// This must happen after check of phi/if
if (n->is_Phi() || n->is_If()) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
// LoadStore, MergeMem and non-CFG projections make the loop unsuitable.
if (n->is_LoadStore() || n->is_MergeMem() ||
(n->is_Proj() && !n->as_Proj()->is_CFG())) {
is_slp = false;
break;
}
// Ignore nodes with non-primitive type.
BasicType bt;
if (n->is_Mem()) {
bt = n->as_Mem()->memory_type();
} else {
bt = n->bottom_type()->basic_type();
}
if (is_java_primitive(bt) == false) {
ignored_loop_nodes[i] = n->_idx;
continue;
}
if (n->is_Mem()) {
MemNode* current = n->as_Mem();
Node* adr = n->in(MemNode::Address);
Node* n_ctrl = _phase->get_ctrl(adr);
// save a queue of post process nodes
if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
// Process the memory expression
int stack_idx = 0;
bool have_side_effects = true;
if (adr->is_AddP() == false) {
nstack.push(adr, stack_idx++);
} else {
// Mark the components of the memory operation in nstack
SWPointer p1(current, this, &nstack, true);
have_side_effects = p1.node_stack()->is_nonempty();
}
// Process the pointer stack
// Every node left on nstack is looked up in the loop body and, when
// found, marked as ignored; the stack is drained in the process.
while (have_side_effects) {
Node* pointer_node = nstack.node();
for (uint j = 0; j < lpt()->_body.size(); j++) {
Node* cur_node = lpt()->_body.at(j);
if (cur_node == pointer_node) {
ignored_loop_nodes[j] = cur_node->_idx;
break;
}
}
nstack.pop();
have_side_effects = nstack.is_nonempty();
}
}
}
}
if (is_slp) {
// Now we try to find the maximum supported consistent vector which the machine
// description can use
bool small_basic_type = false;
bool flag_small_bt = false;
for (uint i = 0; i < lpt()->_body.size(); i++) {
if (ignored_loop_nodes[i] != -1) continue;
BasicType bt;
Node* n = lpt()->_body.at(i);
if (n->is_Mem()) {
bt = n->as_Mem()->memory_type();
} else {
bt = n->bottom_type()->basic_type();
}
if (post_loop_allowed) {
// Remember whether the post loop contains a type for which the unroll
// factor must not be recorded at the end of this function.
if (!small_basic_type) {
switch (bt) {
case T_CHAR:
case T_BYTE:
case T_SHORT:
small_basic_type = true;
break;
case T_LONG:
// TODO: Remove when support completed for mask context with LONG.
// Support needs to be augmented for logical qword operations, currently we map to dword
// buckets for vectors on logicals as these were legacy.
small_basic_type = true;
break;
default:
break;
}
}
}
if (is_java_primitive(bt) == false) continue;
int cur_max_vector = Matcher::max_vector_size(bt);
// If a max vector exists which is not larger than _local_loop_unroll_factor
// stop looking, we already have the max vector to map to.
if (cur_max_vector < local_loop_unroll_factor) {
is_slp = false;
if (TraceSuperWordLoopUnrollAnalysis) {
tty->print_cr("slp analysis fails: unroll limit greater than max vector\n");
}
break;
}
// Map the maximal common vector
if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
// Normally the smallest per-type maximum wins; once a subword type has
// raised max_vector (flag_small_bt) it is no longer lowered.
if (cur_max_vector < max_vector && !flag_small_bt) {
max_vector = cur_max_vector;
} else if (cur_max_vector > max_vector && UseSubwordForMaxVector) {
// Analyse subword in the loop to set maximum vector size to take advantage of full vector width for subword types.
// Here we analyze if narrowing is likely to happen and if it is we set vector size more aggressively.
// We check for possibility of narrowing by looking through chain operations using subword types.
if (is_subword_type(bt)) {
uint start, end;
VectorNode::vector_operands(n, &start, &end);
for (uint j = start; j < end; j++) {
Node* in = n->in(j);
// Don't propagate through a memory
if (!in->is_Mem() && in_bb(in) && in->bottom_type()->basic_type() == T_INT) {
bool same_type = true;
for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
Node *use = in->fast_out(k);
if (!in_bb(use) && use->bottom_type()->basic_type() != bt) {
same_type = false;
break;
}
}
if (same_type) {
max_vector = cur_max_vector;
flag_small_bt = true;
cl->mark_subword_loop();
}
}
}
}
}
// We only process post loops on predicated targets where we want to
// mask map the loop to a single iteration
if (post_loop_allowed) {
_post_block.at_put_grow(rpo_idx++, n);
}
}
}
if (is_slp) {
local_loop_unroll_factor = max_vector;
cl->mark_passed_slp();
}
// Record that the analysis ran, whether or not it succeeded in the scan above.
cl->mark_was_slp();
if (cl->is_main_loop()) {
cl->set_slp_max_unroll(local_loop_unroll_factor);
} else if (post_loop_allowed) {
if (!small_basic_type) {
// avoid replication context for small basic types in programmable masked loops
cl->set_slp_max_unroll(local_loop_unroll_factor);
}
}
}
}
//------------------------------SLP_extract---------------------------
// Extract the superword level parallelism
//
// 1) A reverse post-order of nodes in the block is constructed. By scanning
// this list from first to last, all definitions are visited before their uses.
//
// 2) A point-to-point dependence graph is constructed between memory references.
// This simplifies the upcoming "independence" checker.
//
// 3) The maximum depth in the node graph from the beginning of the block
// to each node is computed. This is used to prune the graph search
// in the independence checker.
//
// 4) For integer types, the necessary bit width is propagated backwards
// from stores to allow packed operations on byte, char, and short
// integers. This reverses the promotion to type "int" that javac
// did for operations like: char c1,c2,c3; c1 = c2 + c3.
//
// 5) One of the memory references is picked to be an aligned vector reference.
// The pre-loop trip count is adjusted to align this reference in the
// unrolled body.
//
// 6) The initial set of pack pairs is seeded with memory references.
//
// 7) The set of pack pairs is extended by following use->def and def->use links.
//
// 8) The pairs are combined into vector sized packs.
//
// 9) Reorder the memory slices to co-locate members of the memory packs.
//
// 10) Generate ideal vector nodes for the final set of packs and where necessary,
// inserting scalar promotion, vector creation from multiple scalars, and
// extraction of scalar values from vectors.
//
// Drives the extraction for the loop previously installed via set_lpt().
//
// Main loops run the full pipeline: build the block, the dependence graph
// and the depth information, compute element types, seed and extend the
// packset, combine/filter the packs and schedule them. Post loops (only when
// PostLoopMultiversioning and predicated vectors are available) re-run
// unrolling_analysis() and wrap every node of _post_block in a singleton
// pack. In both cases output() is called at the end unless one of the phases
// caused an early return.
void SuperWord::SLP_extract() {
#ifndef PRODUCT
if (_do_vector_loop && TraceSuperWord) {
tty->print("SuperWord::SLP_extract\n");
tty->print("input loop\n");
_lpt->dump_head();
_lpt->dump();
for (uint i = 0; i < _lpt->_body.size(); i++) {
_lpt->_body.at(i)->dump();
}
}
#endif
// Ready the block
if (!construct_bb()) {
return; // Exit if no interesting nodes or complex graph.
}
// build _dg, _disjoint_ptrs
dependence_graph();
// compute function depth(Node*)
compute_max_depth();
CountedLoopNode *cl = lpt()->_head->as_CountedLoop();
bool post_loop_allowed = (PostLoopMultiversioning && Matcher::has_predicated_vectors() && cl->is_post_loop());
if (cl->is_main_loop()) {
// _do_vector_loop_experimental is a file-level constant; the guarded code
// only runs when that constant is true.
if (_do_vector_loop_experimental) {
if (mark_generations() != -1) {
hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
if (!construct_bb()) {
return; // Exit if no interesting nodes or complex graph.
}
dependence_graph();
compute_max_depth();
}
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
_lpt->dump_head();
for (int j = 0; j < _block.length(); j++) {
Node* n = _block.at(j);
int d = depth(n);
for (int i = 0; i < d; i++) tty->print("%s", " ");
tty->print("%d :", d);
n->dump();
}
}
#endif
}
compute_vector_element_type();
// Attempt vectorization
find_adjacent_refs();
if (align_to_ref() == NULL) {
return; // Did not find memory reference to align vectors
}
extend_packlist();
if (_do_vector_loop_experimental) {
if (_packset.length() == 0) {
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
}
#endif
pack_parallel();
}
}
combine_packs();
construct_my_pack_map();
if (UseVectorCmov) {
merge_packs_to_cmovd();
}
filter_packs();
schedule();
} else if (post_loop_allowed) {
// The unroll factor stored on the post loop is the value to match against
// a fresh analysis of the post loop itself.
int saved_mapped_unroll_factor = cl->slp_max_unroll();
if (saved_mapped_unroll_factor) {
int vector_mapped_unroll_factor = saved_mapped_unroll_factor;
// now reset the slp_unroll_factor so that we can check the analysis mapped
// what the vector loop was mapped to
cl->set_slp_max_unroll(0);
// do the analysis on the post loop
unrolling_analysis(vector_mapped_unroll_factor);
// if our analyzed loop is a canonical fit, start processing it
if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) {
// now add the vector nodes to packsets
for (int i = 0; i < _post_block.length(); i++) {
Node* n = _post_block.at(i);
Node_List* singleton = new Node_List();
singleton->push(n);
_packset.append(singleton);
set_my_pack(n, singleton);
}
// map base types for vector usage
compute_vector_element_type();
} else {
return;
}
} else {
// for some reason we could not map the slp analysis state of the vectorized loop
return;
}
}
// Reached for main loops, accepted post loops, and loops that are neither.
output();
}
//------------------------------find_adjacent_refs---------------------------
// Find the adjacent memory references and create pack pairs for them.
// This is the initial set of packs that will then be extended by
// following use->def and def->use links. The align positions are
// assigned relative to the reference "align_to_ref"
// Seeds _packset with pairs of adjacent memory references and selects the
// memory reference used for loop alignment (stored via set_align_to_ref(),
// possibly NULL when none was found).
//
// The function repeatedly picks a reference with find_align_to_ref(), assigns
// alignments relative to it to all comparable memory operations, then either
// creates pack pairs or - when an unaligned pack must not be created - removes
// memory operations and already built packs of the same element type.
void SuperWord::find_adjacent_refs() {
// Get list of memory operations
Node_List memops;
for (int i = 0; i < _block.length(); i++) {
Node* n = _block.at(i);
if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
is_java_primitive(n->as_Mem()->memory_type())) {
int align = memory_alignment(n->as_Mem(), 0);
if (align != bottom_align) {
memops.push(n);
}
}
}
if (TraceSuperWord) {
tty->print_cr("\nfind_adjacent_refs found %d memops", memops.size());
}
// align_to_refs collects every reference chosen in the loop below.
Node_List align_to_refs;
int max_idx;
int best_iv_adjustment = 0;
MemNode* best_align_to_mem_ref = NULL;
while (memops.size() != 0) {
// Find a memory reference to align to.
MemNode* mem_ref = find_align_to_ref(memops, max_idx);
if (mem_ref == NULL) break;
align_to_refs.push(mem_ref);
int iv_adjustment = get_iv_adjustment(mem_ref);
if (best_align_to_mem_ref == NULL) {
// Set memory reference which is the best from all memory operations
// to be used for alignment. The pre-loop trip count is modified to align
// this reference to a vector-aligned address.
best_align_to_mem_ref = mem_ref;
best_iv_adjustment = iv_adjustment;
NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
}
SWPointer align_to_ref_p(mem_ref, this, NULL, false);
// Set alignment relative to "align_to_ref" for all related memory operations.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (isomorphic(s, mem_ref) &&
(!_do_vector_loop || same_origin_idx(s, mem_ref))) {
SWPointer p2(s, this, NULL, false);
if (p2.comparable(align_to_ref_p)) {
int align = memory_alignment(s, iv_adjustment);
set_alignment(s, align);
}
}
}
// Create initial pack pairs of memory operations for which
// alignment is set and vectors will be aligned.
bool create_pack = true;
if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
if (vectors_should_be_aligned()) {
int vw = vector_width(mem_ref);
int vw_best = vector_width(best_align_to_mem_ref);
if (vw > vw_best) {
// Do not vectorize a memory access with more elements per vector
// if unaligned memory access is not allowed because number of
// iterations in pre-loop will be not enough to align it.
create_pack = false;
} else {
SWPointer p2(best_align_to_mem_ref, this, NULL, false);
if (!align_to_ref_p.invar_equals(p2)) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
create_pack = false;
}
}
}
} else {
if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
// Can't allow vectorization of unaligned memory accesses with the
// same type since it could be overlapped accesses to the same array.
create_pack = false;
} else {
// Allow independent (different type) unaligned memory operations
// if HW supports them.
if (vectors_should_be_aligned()) {
create_pack = false;
} else {
// Check if packs of the same memory type but
// with a different alignment were created before.
for (uint i = 0; i < align_to_refs.size(); i++) {
MemNode* mr = align_to_refs.at(i)->as_Mem();
if (mr == mem_ref) {
// Skip when we are looking at same memory operation.
continue;
}
if (same_velt_type(mr, mem_ref) &&
memory_alignment(mr, iv_adjustment) != 0)
create_pack = false;
}
}
}
}
if (create_pack) {
// Try every ordered pair (s1, s2) of memory operations that already have
// an alignment assigned.
for (uint i = 0; i < memops.size(); i++) {
Node* s1 = memops.at(i);
int align = alignment(s1);
if (align == top_align) continue;
for (uint j = 0; j < memops.size(); j++) {
Node* s2 = memops.at(j);
if (alignment(s2) == top_align) continue;
if (s1 != s2 && are_adjacent_refs(s1, s2)) {
if (stmts_can_pack(s1, s2, align)) {
Node_List* pair = new Node_List();
pair->push(s1);
pair->push(s2);
if (!_do_vector_loop || same_origin_idx(s1, s2)) {
_packset.append(pair);
}
}
}
}
}
} else { // Don't create unaligned pack
// First, remove remaining memory ops of the same type from the list.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (same_velt_type(s, mem_ref)) {
memops.remove(i);
}
}
// Second, remove already constructed packs of the same type.
for (int i = _packset.length() - 1; i >= 0; i--) {
Node_List* p = _packset.at(i);
MemNode* s = p->at(0)->as_Mem();
if (same_velt_type(s, mem_ref)) {
remove_pack_at(i);
}
}
// If needed find the best memory reference for loop alignment again.
if (same_velt_type(mem_ref, best_align_to_mem_ref)) {
// Put memory ops from remaining packs back on memops list for
// the best alignment search.
// orig_msize remembers the list length so the temporary entries can be
// removed again afterwards.
uint orig_msize = memops.size();
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
MemNode* s = p->at(0)->as_Mem();
assert(!same_velt_type(s, mem_ref), "sanity");
memops.push(s);
}
best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
if (best_align_to_mem_ref == NULL) {
if (TraceSuperWord) {
tty->print_cr("SuperWord::find_adjacent_refs(): best_align_to_mem_ref == NULL");
}
// best_align_to_mem_ref will be used for adjusting the pre-loop limit in
// SuperWord::align_initial_loop_index. Find one with the biggest vector size,
// smallest data size and smallest iv offset from memory ops from remaining packs.
if (_packset.length() > 0) {
if (orig_msize == 0) {
best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
} else {
// Drop the original entries so only the pack members remain, then
// search again; only the resulting max_idx is used.
for (uint i = 0; i < orig_msize; i++) {
memops.remove(0);
}
best_align_to_mem_ref = find_align_to_ref(memops, max_idx);
assert(best_align_to_mem_ref == NULL, "sanity");
best_align_to_mem_ref = memops.at(max_idx)->as_Mem();
}
assert(best_align_to_mem_ref != NULL, "sanity");
}
break;
}
best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
NOT_PRODUCT(find_adjacent_refs_trace_1(best_align_to_mem_ref, best_iv_adjustment);)
// Restore list.
while (memops.size() > orig_msize)
(void)memops.pop();
}
} // unaligned memory accesses
// Remove used mem nodes.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* m = memops.at(i)->as_Mem();
if (alignment(m) != top_align) {
memops.remove(i);
}
}
} // while (memops.size() != 0
set_align_to_ref(best_align_to_mem_ref);
if (TraceSuperWord) {
tty->print_cr("\nAfter find_adjacent_refs");
print_packset();
}
}
#ifndef PRODUCT
// Non-product tracing helper: when is_trace_adjacent() is on, prints the idx
// of the chosen alignment reference together with its iv adjustment and then
// dumps the node itself.
void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment) {
  if (!is_trace_adjacent()) {
    return;
  }
  tty->print("SuperWord::find_adjacent_refs best_align_to_mem_ref = %d, best_iv_adjustment = %d",
             best_align_to_mem_ref->_idx, best_iv_adjustment);
  best_align_to_mem_ref->dump();
}
#endif
//------------------------------find_align_to_ref---------------------------
// Find a memory reference to align the loop induction variable to.
// Looks first at stores then at loads, looking for a memory reference
// with the largest number of references similar to it.
// Picks the memory reference the loop should be aligned to.
//
// memops - candidate memory operations (not modified).
// idx    - out: index into memops of the best candidate seen, or -1 when no
//          candidate of the inspected kind exists. It is set even when NULL
//          is returned.
//
// Returns the chosen reference, or NULL when the best candidate has no
// comparable partner. Stores are preferred; loads are only considered when
// no store with at least one comparable partner was found.
MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
  const uint num_memops = memops.size();
  GrowableArray<int> cmp_ct(arena(), num_memops, num_memops, 0);

  // For every memory op, count how many other ops it is comparable with.
  for (uint i = 0; i < num_memops; i++) {
    MemNode* ref_i = memops.at(i)->as_Mem();
    SWPointer ptr_i(ref_i, this, NULL, false);
    // References that cannot be aligned are only discarded on platforms
    // that require aligned vector memory accesses.
    if (vectors_should_be_aligned() && !ref_is_alignable(ptr_i)) {
      cmp_ct.at_put(i, 0);
      continue;
    }
    for (uint k = i + 1; k < num_memops; k++) {
      MemNode* ref_k = memops.at(k)->as_Mem();
      if (!isomorphic(ref_i, ref_k)) {
        continue;
      }
      SWPointer ptr_k(ref_k, this, NULL, false);
      if (ptr_i.comparable(ptr_k)) {
        cmp_ct.at_put(i, cmp_ct.at(i) + 1);
        cmp_ct.at_put(k, cmp_ct.at(k) + 1);
      }
    }
  }

  // Rank candidates by: most comparable references, then biggest vector
  // width, then smallest data size, then smallest iv offset.
  int max_ct        = 0;
  int max_vw        = 0;
  int max_idx       = -1;
  int min_size      = max_jint;
  int min_iv_offset = max_jint;

  // Pass 0 inspects stores. Pass 1 inspects loads and is only entered when
  // the store pass left max_ct at zero; it continues from the state the
  // store pass left behind.
  for (int pass = 0; pass < 2; pass++) {
    const bool stores_pass = (pass == 0);
    if (!stores_pass && max_ct != 0) {
      break;
    }
    for (uint j = 0; j < num_memops; j++) {
      MemNode* cand = memops.at(j)->as_Mem();
      const bool kind_matches = stores_pass ? cand->is_Store() : cand->is_Load();
      if (!kind_matches) {
        continue;
      }
      int vw = vector_width_in_bytes(cand);
      assert(vw > 1, "sanity");
      SWPointer cand_ptr(cand, this, NULL, false);
      const int ct = cmp_ct.at(j);

      // Lexicographic comparison against the current best candidate.
      bool better;
      if (ct != max_ct) {
        better = ct > max_ct;
      } else if (vw != max_vw) {
        better = vw > max_vw;
      } else if (data_size(cand) != min_size) {
        better = data_size(cand) < min_size;
      } else {
        better = cand_ptr.offset_in_bytes() < min_iv_offset;
      }

      if (better) {
        max_ct        = ct;
        max_vw        = vw;
        max_idx       = j;
        min_size      = data_size(cand);
        min_iv_offset = cand_ptr.offset_in_bytes();
      }
    }
  }

#ifdef ASSERT
  if (TraceSuperWord && Verbose) {
    tty->print_cr("\nVector memops after find_align_to_ref");
    for (uint i = 0; i < num_memops; i++) {
      memops.at(i)->as_Mem()->dump();
    }
  }
#endif

  idx = max_idx;
  if (max_ct <= 0) {
    return NULL;
  }

#ifdef ASSERT
  if (TraceSuperWord) {
    tty->print("\nVector align to node: ");
    memops.at(max_idx)->as_Mem()->dump();
  }
#endif
  return memops.at(max_idx)->as_Mem();
}
//------------------span_works_for_memory_size-----------------------------
// Tells whether an access advancing by 'span' bytes per pre-loop iteration
// is compatible with a memory operation of 'mem_size' bytes at 'offset'.
//
// Byte- or short-sized memory with an int-sized span is accepted exactly
// when every use of 'mem' is a type transition to int (offset is not
// consulted in that case). Otherwise |span| must equal mem_size and |offset|
// must be a multiple of mem_size.
static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
  const int abs_span = ABS(span);
  const bool narrow_mem = (mem_size == type2aelembytes(T_BYTE)) ||
                          (mem_size == type2aelembytes(T_SHORT));

  if (narrow_mem && abs_span == type2aelembytes(T_INT)) {
    // Span and memory size disagree; alignment still works as long as all
    // users widen the value to int.
    for (DUIterator_Fast umax, u = mem->fast_outs(umax); u < umax; u++) {
      if (!VectorNode::is_type_transition_to_int(mem->fast_out(u))) {
        return false;
      }
    }
    return true;
  }

  if (abs_span != mem_size) {
    return false;
  }
  return (ABS(offset) % mem_size) == 0;
}
//------------------------------ref_is_alignable---------------------------
// Can the preloop align the reference to position zero in the vector?
// Answers whether the pre-loop is able to bring the reference described by
// 'p' to position zero of a vector.
//
// Returns true when p does not depend on the induction variable, when
// span_works_for_memory_size() accepts the access, or when the initial
// offset is a compile-time constant that the pre-loop can align (see below).
// Returns false in all other cases.
bool SuperWord::ref_is_alignable(SWPointer& p) {
  if (!p.has_iv()) {
    return true;   // no induction variable
  }

  CountedLoopEndNode* pre_end = pre_loop_end();
  assert(pre_end->stride_is_con(), "pre loop stride is constant");
  const int preloop_stride = pre_end->stride_con();

  // Number of bytes the address moves per pre-loop iteration.
  const int span     = preloop_stride * p.scale_in_bytes();
  const int mem_size = p.memory_size();
  const int offset   = p.offset_in_bytes();

  // Stride one accesses are alignable if offset is aligned to memory operation size.
  // Offset can be unaligned when UseUnalignedAccesses is used.
  if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
    return true;
  }

  // Otherwise, if the initial offset from the start of the object is known,
  // check whether some number i of pre-loop iterations yields an offset that
  // is a multiple of the vector width vw:
  //
  //   (init_offset + pre_loop) % vw == 0                 (1)
  //
  // with pre_loop = i * span being the bytes added by i iterations. For (1)
  // to hold, pre_loop has to be
  //
  //   pre_loop = vw - (init_offset % vw)
  //
  // and, since every iteration contributes exactly 'span' bytes, that amount
  // has to be divisible by span:
  //
  //   (vw - (init_offset % vw)) % span == 0
  //
  int vw = vector_width_in_bytes(p.mem());
  assert(vw > 1, "sanity");

  Node* init_nd = pre_end->init_trip();
  if (!init_nd->is_Con() || p.invar() != NULL) {
    return false;  // initial offset is not computable here
  }

  int init = init_nd->bottom_type()->is_int()->get_con();
  int init_offset = init * p.scale_in_bytes() + offset;
  if (init_offset < 0) {
    // negative offset from object start? may happen in dead loop
    return false;
  }

  if (vw % span == 0) {
    // vw is a multiple of span: apply formula (1).
    if (span > 0) {
      return (vw - (init_offset % vw)) % span == 0;
    }
    assert(span < 0, "nonzero stride * scale");
    return (init_offset % vw) % -span == 0;
  }

  if (span % vw == 0) {
    // span is a multiple of vw, so formula (1) simplifies:
    //   (init_offset + i * span) % vw == 0
    //   => (init_offset % vw) + ((i * span) % vw) == 0
    //   => init_offset % vw == 0
    // Each iteration adds a multiple of vw, hence the final offset is a
    // multiple of vw if and only if init_offset already is one.
    return (init_offset % vw) == 0;
  }

  return false;
}
//---------------------------get_vw_bytes_special------------------------
int SuperWord::get_vw_bytes_special(MemNode* s) {
// Get the vector width in bytes.
int vw = vector_width_in_bytes(s);
// Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
BasicType btype = velt_basic_type(s);
if (type2aelembytes(btype) == 2) {
bool should_combine_adjacent = true;
for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
Node* user = s->fast_out(i);
if (!VectorNode::is_muladds2i(user)) {
should_combine_adjacent = false;
}
}
if (should_combine_adjacent) {
vw = MIN2(Matcher::max_vector_size(btype)*type2aelembytes(btype), vw * 2);
}
}