/*
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp
#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
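// n.b. TIMES_OOP scales a sign-extended 32-bit index register by the
// size of an oop: 4 bytes when compressed oops are in use, 8 otherwise.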
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Stub Code definitions
class StubGenerator: public StubCodeGenerator {
private:
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
void inc_counter_np_(int& counter) {
__ lea(rscratch2, ExternalAddress((address)&counter));
__ ldrw(rscratch1, Address(rscratch2));
__ addw(rscratch1, rscratch1, 1);
__ strw(rscratch1, Address(rscratch2));
}
#define inc_counter_np(counter) \
BLOCK_COMMENT("inc_counter " #counter); \
inc_counter_np_(counter);
#endif
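// n.b. the load/add/store sequence above is not atomic, but these
// non-PRODUCT counters are only statistics, so a lost update under
// contention is acceptable.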
// Call stubs are used to call Java from C
//
// Arguments:
//    c_rarg0:  call wrapper address                   address
//    c_rarg1:  result                                 address
//    c_rarg2:  result type                            BasicType
//    c_rarg3:  method                                 Method*
//    c_rarg4:  (interpreter) entry point              address
//    c_rarg5:  parameters                             intptr_t*
//    c_rarg6:  parameter size (in words)              int
//    c_rarg7:  thread                                 Thread*
//
// There is no return from the stub itself as any Java result
// is written to result
//
// we save r30 (lr) as the return PC at the base of the frame and
// link r29 (fp) below it as the frame pointer, copying the updated
// sp (r31) into fp.
//
// we save r0-r7, which accounts for all the c arguments.
//
// TODO: strictly do we need to save them all? they are treated as
// volatile by C so could we omit saving the ones we are going to
// place in global registers (thread? method?) or those we only use
// during setup of the Java call?
//
// we don't need to save r8 which C uses as an indirect result location
// return register.
//
// we don't need to save r9-r15 which both C and Java treat as
// volatile
//
// we don't need to save r16-18 because Java does not use them
//
// we save r19-r28 which Java uses as scratch registers and C
// expects to be callee-save
//
// we save the bottom 64 bits of each value stored in v8-v15; it is
// the responsibility of the caller to preserve larger values.
//
// so the stub frame looks like this when we enter Java code
//
// [ return_from_Java ] <--- sp
// [ argument word n ]
// ...
// -27 [ argument word 1 ]
// -26 [ saved v15 ] <--- sp_after_call
// -25 [ saved v14 ]
// -24 [ saved v13 ]
// -23 [ saved v12 ]
// -22 [ saved v11 ]
// -21 [ saved v10 ]
// -20 [ saved v9 ]
// -19 [ saved v8 ]
// -18 [ saved r28 ]
// -17 [ saved r27 ]
// -16 [ saved r26 ]
// -15 [ saved r25 ]
// -14 [ saved r24 ]
// -13 [ saved r23 ]
// -12 [ saved r22 ]
// -11 [ saved r21 ]
// -10 [ saved r20 ]
// -9 [ saved r19 ]
// -8 [ call wrapper (r0) ]
// -7 [ result (r1) ]
// -6 [ result type (r2) ]
// -5 [ method (r3) ]
// -4 [ entry point (r4) ]
// -3 [ parameters (r5) ]
// -2 [ parameter size (r6) ]
// -1 [ thread (r7) ]
// 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
// 1 [ saved lr (r30) ]
// Call stub stack layout word offsets from fp
enum call_stub_layout {
sp_after_call_off = -26,
d15_off = -26,
d13_off = -24,
d11_off = -22,
d9_off = -20,
r28_off = -18,
r26_off = -16,
r24_off = -14,
r22_off = -12,
r20_off = -10,
call_wrapper_off = -8,
result_off = -7,
result_type_off = -6,
method_off = -5,
entry_point_off = -4,
parameter_size_off = -2,
thread_off = -1,
fp_f = 0,
retaddr_off = 1,
};
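// n.b. only every other saved-register offset is named above because the
// registers are saved and restored in pairs: e.g. the single stp to
// r20_save writes both r20 (at word -10) and r19 (at word -9).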
address generate_call_stub(address& return_address) {
assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
(int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
"adjust this code");
StubCodeMark mark(this, "StubRoutines", "call_stub");
address start = __ pc();
const Address sp_after_call(rfp, sp_after_call_off * wordSize);
const Address call_wrapper (rfp, call_wrapper_off * wordSize);
const Address result (rfp, result_off * wordSize);
const Address result_type (rfp, result_type_off * wordSize);
const Address method (rfp, method_off * wordSize);
const Address entry_point (rfp, entry_point_off * wordSize);
const Address parameter_size(rfp, parameter_size_off * wordSize);
const Address thread (rfp, thread_off * wordSize);
const Address d15_save (rfp, d15_off * wordSize);
const Address d13_save (rfp, d13_off * wordSize);
const Address d11_save (rfp, d11_off * wordSize);
const Address d9_save (rfp, d9_off * wordSize);
const Address r28_save (rfp, r28_off * wordSize);
const Address r26_save (rfp, r26_off * wordSize);
const Address r24_save (rfp, r24_off * wordSize);
const Address r22_save (rfp, r22_off * wordSize);
const Address r20_save (rfp, r20_off * wordSize);
// stub code
address aarch64_entry = __ pc();
// set up frame and move sp to end of save area
__ enter();
__ sub(sp, rfp, -sp_after_call_off * wordSize);
// save register parameters and Java scratch/global registers
// n.b. we save thread even though it gets installed in
// rthread because we want to sanity check rthread later
__ str(c_rarg7, thread);
__ strw(c_rarg6, parameter_size);
__ stp(c_rarg4, c_rarg5, entry_point);
__ stp(c_rarg2, c_rarg3, result_type);
__ stp(c_rarg0, c_rarg1, call_wrapper);
__ stp(r20, r19, r20_save);
__ stp(r22, r21, r22_save);
__ stp(r24, r23, r24_save);
__ stp(r26, r25, r26_save);
__ stp(r28, r27, r28_save);
__ stpd(v9, v8, d9_save);
__ stpd(v11, v10, d11_save);
__ stpd(v13, v12, d13_save);
__ stpd(v15, v14, d15_save);
// install Java thread in global register now we have saved
// whatever value it held
__ mov(rthread, c_rarg7);
// And method
__ mov(rmethod, c_rarg3);
// set up the heapbase register
__ reinit_heapbase();
#ifdef ASSERT
// make sure we have no pending exceptions
{
Label L;
__ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
__ cmp(rscratch1, (u1)NULL_WORD);
__ br(Assembler::EQ, L);
__ stop("StubRoutines::call_stub: entered with pending exception");
__ BIND(L);
}
#endif
// pass parameters if any
__ mov(esp, sp);
__ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
__ andr(sp, rscratch1, -2 * wordSize);
BLOCK_COMMENT("pass parameters if any");
Label parameters_done;
// parameter count is still in c_rarg6
// and parameter pointer identifying param 1 is in c_rarg5
__ cbzw(c_rarg6, parameters_done);
address loop = __ pc();
__ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
__ subsw(c_rarg6, c_rarg6, 1);
__ push(rscratch1);
__ br(Assembler::GT, loop);
__ BIND(parameters_done);
// call Java entry -- passing Method* and current sp
// rmethod: Method*
// r13: sender sp
BLOCK_COMMENT("call Java function");
__ mov(r13, sp);
__ blr(c_rarg4);
// we do this here because the notify will already have been done
// if we get to the next instruction via an exception
//
// n.b. adding this instruction here affects the calculation of
// whether or not a routine returns to the call stub (used when
// doing stack walks) since the normal test is to check the return
// pc against the address saved below. so we may need to allow for
// this extra instruction in the check.
// save current address for use by exception handling code
return_address = __ pc();
// store result depending on type (everything that is not
// T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
// n.b. this assumes Java returns an integral result in r0
// and a floating result in j_farg0
__ ldr(j_rarg2, result);
Label is_long, is_float, is_double, exit;
__ ldr(j_rarg1, result_type);
__ cmp(j_rarg1, (u1)T_OBJECT);
__ br(Assembler::EQ, is_long);
__ cmp(j_rarg1, (u1)T_LONG);
__ br(Assembler::EQ, is_long);
__ cmp(j_rarg1, (u1)T_FLOAT);
__ br(Assembler::EQ, is_float);
__ cmp(j_rarg1, (u1)T_DOUBLE);
__ br(Assembler::EQ, is_double);
// handle T_INT case
__ strw(r0, Address(j_rarg2));
__ BIND(exit);
// pop parameters
__ sub(esp, rfp, -sp_after_call_off * wordSize);
#ifdef ASSERT
// verify that threads correspond
{
Label L, S;
__ ldr(rscratch1, thread);
__ cmp(rthread, rscratch1);
__ br(Assembler::NE, S);
__ get_thread(rscratch1);
__ cmp(rthread, rscratch1);
__ br(Assembler::EQ, L);
__ BIND(S);
__ stop("StubRoutines::call_stub: threads must correspond");
__ BIND(L);
}
#endif
// restore callee-save registers
__ ldpd(v15, v14, d15_save);
__ ldpd(v13, v12, d13_save);
__ ldpd(v11, v10, d11_save);
__ ldpd(v9, v8, d9_save);
__ ldp(r28, r27, r28_save);
__ ldp(r26, r25, r26_save);
__ ldp(r24, r23, r24_save);
__ ldp(r22, r21, r22_save);
__ ldp(r20, r19, r20_save);
__ ldp(c_rarg0, c_rarg1, call_wrapper);
__ ldrw(c_rarg2, result_type);
__ ldr(c_rarg3, method);
__ ldp(c_rarg4, c_rarg5, entry_point);
__ ldp(c_rarg6, c_rarg7, parameter_size);
// leave frame and return to caller
__ leave();
__ ret(lr);
// handle return types different from T_INT
__ BIND(is_long);
__ str(r0, Address(j_rarg2, 0));
__ br(Assembler::AL, exit);
__ BIND(is_float);
__ strs(j_farg0, Address(j_rarg2, 0));
__ br(Assembler::AL, exit);
__ BIND(is_double);
__ strd(j_farg0, Address(j_rarg2, 0));
__ br(Assembler::AL, exit);
return start;
}
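// n.b. the VM enters Java through this stub by casting its address to
// an eight-argument C function pointer; roughly (approximate sketch of
// JavaCalls::call_helper in javaCalls.cpp):
//
//   StubRoutines::call_stub()((address)&link, result_address, result_type,
//                             method, entry_point, parameters,
//                             parameter_size, thread);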
// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee. In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => need to setup the
// rsp.
//
// r0: exception oop
address generate_catch_exception() {
StubCodeMark mark(this, "StubRoutines", "catch_exception");
address start = __ pc();
// same as in generate_call_stub():
const Address sp_after_call(rfp, sp_after_call_off * wordSize);
const Address thread (rfp, thread_off * wordSize);
#ifdef ASSERT
// verify that threads correspond
{
Label L, S;
__ ldr(rscratch1, thread);
__ cmp(rthread, rscratch1);
__ br(Assembler::NE, S);
__ get_thread(rscratch1);
__ cmp(rthread, rscratch1);
__ br(Assembler::EQ, L);
__ bind(S);
__ stop("StubRoutines::catch_exception: threads must correspond");
__ bind(L);
}
#endif
// set pending exception
__ verify_oop(r0);
__ str(r0, Address(rthread, Thread::pending_exception_offset()));
__ mov(rscratch1, (address)__FILE__);
__ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
__ movw(rscratch1, (int)__LINE__);
__ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
// complete return to VM
assert(StubRoutines::_call_stub_return_address != NULL,
"_call_stub_return_address must have been generated before");
__ b(StubRoutines::_call_stub_return_address);
return start;
}
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Contract with Java-level exception handlers:
// r0: exception
// r3: throwing pc
//
// NOTE: At entry of this stub, exception-pc must be in LR !!
// NOTE: this is always used as a jump target within generated code
// so it just needs to be generated code with no prolog
address generate_forward_exception() {
StubCodeMark mark(this, "StubRoutines", "forward exception");
address start = __ pc();
// Upon entry, LR points to the return address returning into
// Java (interpreted or compiled) code; i.e., the return address
// becomes the throwing pc.
//
// Arguments pushed before the runtime call are still on the stack
// but the exception handler will reset the stack pointer ->
// ignore them. A potential result in registers can be ignored as
// well.
#ifdef ASSERT
// make sure this code is only executed if there is a pending exception
{
Label L;
__ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
__ cbnz(rscratch1, L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
}
#endif
// compute exception handler into r19
// call the VM to find the handler address associated with the
// caller address. pass thread in r0 and caller pc (ret address)
// in r1. n.b. the caller pc is in lr, unlike x86 where it is on
// the stack.
__ mov(c_rarg1, lr);
// lr will be trashed by the VM call so we move it to R19
// (callee-saved) because we also need to pass it to the handler
// returned by this call.
__ mov(r19, lr);
BLOCK_COMMENT("call exception_handler_for_return_address");
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
SharedRuntime::exception_handler_for_return_address),
rthread, c_rarg1);
// Reinitialize the ptrue predicate register, in case the external runtime
// call clobbers ptrue reg, as we may return to SVE compiled code.
__ reinitialize_ptrue();
// we should not really care that lr is no longer the callee
// address. we saved the value the handler needs in r19 so we can
// just copy it to r3. however, the C2 handler will push its own
// frame and then calls into the VM and the VM code asserts that
// the PC for the frame above the handler belongs to a compiled
// Java method. So, we restore lr here to satisfy that assert.
__ mov(lr, r19);
// setup r0 & r3 & clear pending exception
__ mov(r3, r19);
__ mov(r19, r0);
__ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
__ str(zr, Address(rthread, Thread::pending_exception_offset()));
#ifdef ASSERT
// make sure exception is set
{
Label L;
__ cbnz(r0, L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
}
#endif
// continue at exception handler
// r0: exception
// r3: throwing pc
// r19: exception handler
__ verify_oop(r0);
__ br(r19);
return start;
}
// Non-destructive plausibility checks for oops
//
// Arguments:
// r0: oop to verify
// rscratch1: error message
//
// Stack after saving c_rarg3:
// [tos + 0]: saved c_rarg3
// [tos + 1]: saved c_rarg2
// [tos + 2]: saved lr
// [tos + 3]: saved rscratch2
// [tos + 4]: saved r0
// [tos + 5]: saved rscratch1
address generate_verify_oop() {
StubCodeMark mark(this, "StubRoutines", "verify_oop");
address start = __ pc();
Label exit, error;
// save c_rarg2 and c_rarg3
__ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
// __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
__ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
__ ldr(c_rarg3, Address(c_rarg2));
__ add(c_rarg3, c_rarg3, 1);
__ str(c_rarg3, Address(c_rarg2));
// object is in r0
// make sure object is 'reasonable'
__ cbz(r0, exit); // if obj is NULL it is OK
#if INCLUDE_ZGC
if (UseZGC) {
// Check if mask is good.
// verifies that ZAddressBadMask & r0 == 0
__ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
__ andr(c_rarg2, r0, c_rarg3);
__ cbnz(c_rarg2, error);
}
#endif
// Check if the oop is in the right area of memory
__ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
__ andr(c_rarg2, r0, c_rarg3);
__ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
// Compare c_rarg2 and c_rarg3. We don't use a compare
// instruction here because the flags register is live.
__ eor(c_rarg2, c_rarg2, c_rarg3);
__ cbnz(c_rarg2, error);
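// n.b. the eor leaves (r0 & verify_oop_mask) ^ verify_oop_bits in
// c_rarg2, which is zero exactly when the masked bits match the
// expected pattern, so cbnz reports a mismatch without touching the
// live flags.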
// make sure klass is 'reasonable', which is not zero.
__ load_klass(r0, r0); // get klass
__ cbz(r0, error); // if klass is NULL it is broken
// return if everything seems ok
__ bind(exit);
__ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
__ ret(lr);
// handle errors
__ bind(error);
__ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
__ push(RegSet::range(r0, r29), sp);
// debug(char* msg, int64_t pc, int64_t regs[])
__ mov(c_rarg0, rscratch1); // pass address of error message
__ mov(c_rarg1, lr); // pass return address
__ mov(c_rarg2, sp); // pass address of regs on stack
#ifndef PRODUCT
assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
BLOCK_COMMENT("call MacroAssembler::debug");
__ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
__ blr(rscratch1);
__ hlt(0);
return start;
}
void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
// Generate indices for iota vector.
address generate_iota_indices(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x0706050403020100, relocInfo::none);
__ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
return start;
}
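// n.b. the two 64-bit literals are little-endian, so together they form
// the 16-byte constant {0, 1, 2, ..., 15}; vector code can load all the
// lane indices with a single 128-bit load of this table.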
// The inner part of zero_words(). This is the bulk operation,
// zeroing words in blocks, possibly using DC ZVA to do it. The
// caller is responsible for zeroing the last few words.
//
// Inputs:
// r10: the HeapWord-aligned base address of an array to zero.
// r11: the count in HeapWords, r11 > 0.
//
// Returns r10 and r11, adjusted for the caller to clear.
// r10: the base address of the tail of words left to clear.
// r11: the number of words in the tail.
// r11 < MacroAssembler::zero_words_block_size.
address generate_zero_blocks() {
Label done;
Label base_aligned;
Register base = r10, cnt = r11;
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "zero_blocks");
address start = __ pc();
if (UseBlockZeroing) {
int zva_length = VM_Version::zva_length();
// Ensure ZVA length can be divided by 16. This is required by
// the subsequent operations.
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
__ tbz(base, 3, base_aligned);
__ str(zr, Address(__ post(base, 8)));
__ sub(cnt, cnt, 1);
__ bind(base_aligned);
// Ensure count >= zva_length * 2 so that it still deserves a zva after
// alignment.
Label small;
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
__ subs(rscratch1, cnt, low_limit >> 3);
__ br(Assembler::LT, small);
__ zero_dcache_blocks(base, cnt);
__ bind(small);
}
{
// Number of stp instructions we'll unroll
const int unroll =
MacroAssembler::zero_words_block_size / 2;
// Clear the remaining blocks.
Label loop;
__ subs(cnt, cnt, unroll * 2);
__ br(Assembler::LT, done);
__ bind(loop);
for (int i = 0; i < unroll; i++)
__ stp(zr, zr, __ post(base, 16));
__ subs(cnt, cnt, unroll * 2);
__ br(Assembler::GE, loop);
__ bind(done);
__ add(cnt, cnt, unroll * 2);
}
__ ret(lr);
return start;
}
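// A rough sketch of what the generated zero_blocks code does, with cnt
// counting words (the DC ZVA path is only taken when UseBlockZeroing is
// enabled and cnt is large enough to survive the alignment step):
//
//   if (use_zva && cnt >= max(2 * zva_length, BlockZeroingLowLimit) / 8) {
//     align base to 16 bytes, then clear whole blocks with DC ZVA;
//   }
//   while (cnt >= zero_words_block_size) {    // unrolled stp zr, zr loop
//     clear zero_words_block_size words at base; advance base, reduce cnt;
//   }
//   return with base/cnt describing the remaining tail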
typedef enum {
copy_forwards = 1,
copy_backwards = -1
} copy_direction;
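// Multiplying wordSize by the direction yields a signed step ("unit"),
// so one pre/post-indexed addressing pattern serves both cases: it walks
// memory upwards for a forwards copy and downwards for a backwards copy.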
// Bulk copy of blocks of 8 words.
//
// count is a count of words.
//
// Precondition: count >= 8
//
// Postconditions:
//
// The least significant bit of count contains the remaining count
// of words to copy. The rest of count is trash.
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(Label &start, Register s, Register d, Register count,
copy_direction direction) {
int unit = wordSize * direction;
int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
t4 = r7, t5 = r10, t6 = r11, t7 = r12;
const Register stride = r13;
assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
assert_different_registers(s, d, count, rscratch1);
Label again, drain;
const char *stub_name;
if (direction == copy_forwards)
stub_name = "forward_copy_longs";
else
stub_name = "backward_copy_longs";
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
__ bind(start);
Label unaligned_copy_long;
if (AvoidUnalignedAccesses) {
__ tbnz(d, 3, unaligned_copy_long);
}
if (direction == copy_forwards) {
__ sub(s, s, bias);
__ sub(d, d, bias);
}
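// n.b. for a forwards copy this bias lets the loop below address each
// 8-word block at fixed positive multiples of unit and fold the pointer
// update into the final pre-indexed access at 8 * unit; a backwards
// copy gets the same effect for free since unit is negative.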
#ifdef ASSERT
// Make sure we are never given < 8 words
{
Label L;
__ cmp(count, (u1)8);
__ br(Assembler::GE, L);
__ stop("genrate_copy_longs called with < 8 words");
__ bind(L);
}
#endif
// Fill 8 registers
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(s, 4 * unit));
__ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
} else {
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
}
__ subs(count, count, 16);
__ br(Assembler::LO, drain);
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
}
__ bind(again);
if (PrefetchCopyIntervalInBytes > 0)
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
if (UseSIMDForMemoryOps) {
__ stpq(v0, v1, Address(d, 4 * unit));
__ ldpq(v0, v1, Address(s, 4 * unit));
__ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
__ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
} else {
__ stp(t0, t1, Address(d, 2 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t2, t3, Address(d, 4 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t4, t5, Address(d, 6 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ stp(t6, t7, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
}
__ subs(count, count, 8);
__ br(Assembler::HS, again);
// Drain
__ bind(drain);
if (UseSIMDForMemoryOps) {
__ stpq(v0, v1, Address(d, 4 * unit));
__ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
} else {
__ stp(t0, t1, Address(d, 2 * unit));
__ stp(t2, t3, Address(d, 4 * unit));
__ stp(t4, t5, Address(d, 6 * unit));
__ stp(t6, t7, Address(__ pre(d, 8 * unit)));
}
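// copy any remaining part block: bit 2 of count tells us whether a
// 4-word subblock remains and bit 1 whether a 2-word subblock does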
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1);
if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
__ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
} else {
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
__ stp(t0, t1, Address(d, 2 * unit));
__ stp(t2, t3, Address(__ pre(d, 4 * unit)));
}
__ bind(L1);
if (direction == copy_forwards) {
__ add(s, s, bias);
__ add(d, d, bias);
}
__ tbz(count, 1, L2);
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
__ bind(L2);
}
__ ret(lr);
if (AvoidUnalignedAccesses) {
Label drain, again;
// Register order for storing. Order is different for backward copy.
__ bind(unaligned_copy_long);
// source address is even aligned, target odd aligned
//
// when forward copying word pairs we read long pairs at offsets
// {0, 2, 4, 6} (in long words). when backwards copying we read
// long pairs at offsets {-2, -4, -6, -8}. We adjust the source
// address by -2 in the forwards case so we can compute the
// source offsets for both as {2, 4, 6, 8} * unit where unit = 1
// or -1.
//
// when forward copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
// zero offset we adjust the destination by -1 which means we
// have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
//
// When backwards copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
// offsets {1, 3, 5, 7, 8} * unit.
if (direction == copy_forwards) {
__ sub(s, s, 16);
__ sub(d, d, 8);
}
// Fill 8 registers
//
// for forwards copy s was offset by -16 from the original input
// value of s so the register contents are at these offsets
// relative to the 64 bit block addressed by that original input
// and so on for each successive 64 byte block when s is updated
//
// t0 at offset 0, t1 at offset 8
// t2 at offset 16, t3 at offset 24
// t4 at offset 32, t5 at offset 40
// t6 at offset 48, t7 at offset 56
// for backwards copy s was not offset so the register contents
// are at these offsets into the preceding 64 byte block
// relative to that original input and so on for each successive
// preceding 64 byte block when s is updated. this explains the
// slightly counter-intuitive looking pattern of register usage
// in the stp instructions for backwards copy.
//
// t0 at offset -16, t1 at offset -8
// t2 at offset -32, t3 at offset -24
// t4 at offset -48, t5 at offset -40
// t6 at offset -64, t7 at offset -56
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
__ subs(count, count, 16);
__ br(Assembler::LO, drain);
int prefetch = PrefetchCopyIntervalInBytes;
bool use_stride = false;
if (direction == copy_backwards) {
use_stride = prefetch > 256;
prefetch = -prefetch;
if (use_stride) __ mov(stride, prefetch);
}
__ bind(again);
if (PrefetchCopyIntervalInBytes > 0)
__ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
if (direction == copy_forwards) {
// allowing for the offset of -8 the store instructions place
// registers into the target 64 bit block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8, t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
} else {
// d was not offset when we started so the registers are
// written into the 64 bit block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -40, t2 at offset -32
// t7 at offset -56, t4 at offset -48
// t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t5, t2, Address(d, 5 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t7, t4, Address(d, 7 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ str(t6, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
}
__ subs(count, count, 8);
__ br(Assembler::HS, again);
// Drain
//
// this uses the same pattern of offsets and register arguments
// as above
__ bind(drain);
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ stp(t5, t2, Address(d, 5 * unit));
__ stp(t7, t4, Address(d, 7 * unit));
__ str(t6, Address(__ pre(d, 8 * unit)));
}
// now we need to copy any remaining part block which may
// include a 4 word block subblock and/or a 2 word subblock.
// bits 2 and 1 in the count are the tell-tale for whether we
// have each such subblock
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1);
// this is the same as above but copying only 4 longs hence
// with only one intervening stp between the str instructions
// but note that the offsets and registers still follow the
// same pattern
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));