/
sgen-gc.c
4030 lines (3751 loc) · 131 KB
/
sgen-gc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* sgen-gc.c: Simple generational GC.
*
* Author:
* Paolo Molaro (lupus@ximian.com)
*
* Copyright (C) 2005-2006 Novell, Inc
*
* Thread start/stop adapted from Boehm's GC:
* Copyright (c) 1994 by Xerox Corporation. All rights reserved.
* Copyright (c) 1996 by Silicon Graphics. All rights reserved.
* Copyright (c) 1998 by Fergus Henderson. All rights reserved.
* Copyright (c) 2000-2004 by Hewlett-Packard Company. All rights reserved.
*
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
* OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
*
* Permission is hereby granted to use or copy this program
* for any purpose, provided the above notices are retained on all copies.
* Permission to modify the code and to distribute modified code is granted,
* provided the above notices are retained, and a notice that the code was
* modified is included with the above copyright notice.
*
* All the rest of the code is LGPL.
*
* Important: allocation provides always zeroed memory, having to do
* a memset after allocation is deadly for performance.
* Memory usage at startup is currently as follows:
* 64 KB pinned space
* 64 KB internal space
* size of nursery
* We should provide a small memory config with half the sizes
*
* We currently try to make as few mono assumptions as possible:
* 1) 2-word header with no GC pointers in it (first vtable, second to store the
* forwarding ptr)
* 2) gc descriptor is the second word in the vtable (first word in the class)
* 3) 8 byte alignment is the minimum and enough (not true for special structures, FIXME)
* 4) there is a function to get an object's size and the number of
* elements in an array.
* 5) we know the special way bounds are allocated for complex arrays
*
* Always try to keep stack usage to a minimum: no recursive behaviour
* and no large stack allocs.
*
* General description.
* Objects are initially allocated in a nursery using a fast bump-pointer technique.
* When the nursery is full we start a nursery collection: this is performed with a
* copying GC.
* When the old generation is full we start a copying GC of the old generation as well:
* this will be changed to mark/compact in the future.
* The things that complicate this description are:
* *) pinned objects: we can't move them so we need to keep track of them
* *) no precise info of the thread stacks and registers: we need to be able to
* quickly find the objects that may be referenced conservatively and pin them
* (this makes the first issues more important)
* *) large objects are too expensive to be dealt with using copying GC: we handle them
* with mark/sweep during major collections
* *) some objects need to not move even if they are small (interned strings, Type handles):
* we use mark/sweep for them, too: they are not allocated in the nursery, but inside
* PinnedChunks regions
*/
/*
* TODO:
*) change the jit to emit write barrier calls when needed (we
can have specialized write barriers): done with icalls, still need to
use some specialized barriers
*) we could have a function pointer in MonoClass to implement
customized write barriers for value types
*) the write barrier code could be isolated in a couple of functions: when a
thread is stopped if it's inside the barrier it is let go again
until we stop outside of them (not really needed, see below GC-safe points)
*) investigate the stuff needed to advance a thread to a GC-safe
point (single-stepping, read from unmapped memory etc) and implement it
Not needed yet: since we treat the objects reachable from the stack/regs as
roots, we store the ptr and exec the write barrier so there is no race.
We may need this to solve the issue with setting the length of arrays and strings.
We may need this also for handling precise info on stacks, even simple things
as having uninitialized data on the stack and having to wait for the prolog
to zero it. Not an issue for the last frame that we scan conservatively.
We could always not trust the value in the slots anyway.
*) make the jit info table lock free
*) modify the jit to save info about references in stack locations:
this can be done just for locals as a start, so that at least
part of the stack is handled precisely.
*) Make the debug printf stuff thread and signal safe.
*) test/fix 64 bit issues
*) test/fix endianness issues
*) port to non-Linux
*) add batch moving profile info
*) add more timing info
*) there is a possible race when an array or string is created: the vtable is set,
but the length is set only later so if the GC needs to scan the object in that window,
it won't get the correct size for the object. The object can't have references and it will
be pinned, but a free memory fragment may be created that overlaps with it.
We should change the array max_length field to be at the same offset as the string length:
this way we can have a single special alloc function for them that sets the length.
Multi-dim arrays have the same issue for rank == 1 for the bounds data.
*) implement a card table as the write barrier instead of remembered sets?
*) some sort of blacklist support?
*) fin_ready_list is part of the root set, too
*) consider lowering the large object min size to 16/32KB or so and benchmark
*) once mark-compact is implemented we could still keep the
copying collector for the old generation and use it if we think
it is better (small heaps and no pinning object in the old
generation)
*) avoid the memory store from copy_object when not needed.
*) optimize the write barriers fastpath to happen in managed code
*) add an option to mmap the whole heap in one chunk: it makes for many
simplifications in the checks (put the nursery at the top and just use a single
check for inclusion/exclusion): the issue this has is that on 32 bit systems it's
not flexible (too much of the address space may be used by default or we can't
increase the heap as needed) and we'd need a race-free mechanism to return memory
back to the system (mprotect(PROT_NONE) will still keep the memory allocated if it
was written to, munmap is needed, but the following mmap may not find the same segment
free...)
*) memzero the fragments after restarting the world and optionally a smaller chunk at a time
*) an additional strategy to realloc/expand the nursery when fully pinned is to start
allocating objects in the old generation. This means that we can't optimize away write
barrier calls in ctors (but that is not valid for other reasons, too).
*) add write barriers to the Clone methods
*/
#include "config.h"
#ifdef HAVE_SGEN_GC
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <semaphore.h>
#include <signal.h>
#include <errno.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <time.h>
#include <fcntl.h>
#include "metadata/metadata-internals.h"
#include "metadata/class-internals.h"
#include "metadata/gc-internal.h"
#include "metadata/object-internals.h"
#include "metadata/threads.h"
#include "metadata/sgen-gc.h"
#include "metadata/mono-gc.h"
/*
* ######################################################################
* ######## Types and constants used by the GC.
* ######################################################################
*/
#if SIZEOF_VOID_P == 4
typedef guint32 mword;
#else
typedef guint64 mword;
#endif
static int gc_initialized = 0;
static int gc_debug_level = 0;
static FILE* gc_debug_file;
/*
 * mono_gc_flush_info:
 *
 * Flushes whatever output is still buffered on the GC debug stream
 * (gc_debug_file).
 */
void
mono_gc_flush_info (void)
{
	FILE *debug_stream = gc_debug_file;

	fflush (debug_stream);
}
/* Highest debug level that DEBUG() will ever execute, whatever gc_debug_level says. */
#define MAX_DEBUG_LEVEL 9
/*
 * Runs statement `a` only when `level` is at most MAX_DEBUG_LEVEL and at most
 * the runtime gc_debug_level.
 */
#define DEBUG(level,a) do {if ((level) <= MAX_DEBUG_LEVEL && (level) <= gc_debug_level) a;} while (0)
/*
 * Difference between two values that have tv_sec/tv_usec members, in microseconds.
 * Every use of the arguments is parenthesized so that expressions such as
 * `*ptr` can be passed safely.
 */
#define TV_ELAPSED(start,end) ((((end).tv_sec - (start).tv_sec) * 1000000) + (end).tv_usec - (start).tv_usec)
/* Number of bits in an mword. */
#define GC_BITS_PER_WORD (sizeof (mword) * 8)
/*
 * Roles a block of memory can play.
 * NOTE(review): presumably these are the values kept in the `role` fields of
 * GCMemSection and LOSObject - confirm against the code that sets them.
 */
enum {
MEMORY_ROLE_GEN0,
MEMORY_ROLE_GEN1,
MEMORY_ROLE_GEN2,
MEMORY_ROLE_FIXED,
MEMORY_ROLE_INTERNAL
};
/* each request from the OS ends up in a GCMemSection */
typedef struct _GCMemSection GCMemSection;
struct _GCMemSection {
/* link to the next section in a singly linked list */
GCMemSection *next;
/* NOTE(review): presumably the start of the section's memory and its size in bytes - confirm */
char *data;
mword size;
/* pointer where more data could be allocated if it fits */
char *next_data;
/* NOTE(review): presumably the end of the usable area of the section - confirm */
char *end_data;
/*
* scan starts is an array of pointers to objects equally spaced in the allocation area
* They let us quickly find pinned objects from pinning pointers.
*/
char **scan_starts;
/* in major collections indexes in the pin_queue for objects that pin this section */
int pin_queue_start;
int pin_queue_end;
/* NOTE(review): presumably the number of entries in scan_starts - confirm */
unsigned short num_scan_start;
/* NOTE(review): presumably one of the MEMORY_ROLE_* values - confirm */
unsigned char role;
};
/* large object space struct: 64+ KB */
/* we could make this limit much smaller to avoid memcpy copy
* and potentially have more room in the GC descriptor: need to measure
* This also means that such small OS objects will need to be
* allocated in a different way (using pinned chunks).
* We may want to put large but smaller than 64k objects in the fixed space
* when we move the object from one generation to another (to limit the
* pig in the snake effect).
* Note: it may be worth to have an optimized copy function, since we can
* assume that objects are aligned and have a multiple of 8 size.
* FIXME: This structure needs to be a multiple of 8 bytes in size: this is not
* true if MONO_ZERO_LEN_ARRAY is nonzero.
*/
/* Header placed in front of each object kept in the large object space. */
typedef struct _LOSObject LOSObject;
struct _LOSObject {
/* link to the next large object in a singly linked list */
LOSObject *next;
mword size; /* this is the object size */
int dummy; /* to have a sizeof (LOSObject) a multiple of ALLOC_ALIGN */
/* NOTE(review): presumably one of the MEMORY_ROLE_* values - confirm */
unsigned char role;
/* trailing array marking where the object's own data begins */
char *data [MONO_ZERO_LEN_ARRAY];
};
/* Pinned objects are allocated in the LOS space if bigger than half a page
* or from freelists otherwise. We assume that pinned objects are relatively few
* and they have a slow dying speed (like interned strings, thread objects).
* As such they will be collected only at major collections.
* free lists are not global: when we need memory we allocate a PinnedChunk.
* Each pinned chunk is made of several pages, the first of which is used
* internally for bookkeeping (here think of a page as 4KB). The bookkeeping
* includes the freelists vectors and info about the object size of each page
* in the pinned chunk. So, when needed, a free page is found in a pinned chunk,
* a size is assigned to it, the page is divided in the proper chunks and each
* chunk is added to the freelist. To not waste space, the remaining space in the
* first page is used as objects of size 16 or 32 (need to measure which are more
* common).
* We use this same structure to allocate memory used internally by the GC, so
* we never use malloc/free if we need to alloc during collection: the world is stopped
* and malloc/free will deadlock.
* When we want to iterate over pinned objects, we just scan a page at a time
* linearly according to the size of objects in the page: the next pointer used to link
* the items in the freelist uses the same word as the vtable. Since we keep freelists
* for each pinned chunk, if the word points outside the pinned chunk it means
* it is an object.
* We could avoid this expensive scanning in creative ways. We could have a policy
* of putting in the pinned space only objects we know about that have no struct fields
* with references and we can easily use a even expensive write barrier for them,
* since pointer writes on such objects should be rare.
* The best compromise is to just alloc interned strings and System.MonoType in them.
* It would be nice to allocate MonoThread in it, too: must check that we properly
* use write barriers so we don't have to do any expensive scanning of the whole pinned
* chunk list during minor collections. We can avoid it now because we alloc in it only
* reference-free objects.
*/
/* size of four pointers */
#define PINNED_FIRST_SLOT_SIZE (sizeof (gpointer) * 4)
/* same value as the largest entry of the freelist_sizes table */
#define MAX_FREELIST_SIZE 2048
/* page size used for pinned chunk bookkeeping */
#define PINNED_PAGE_SIZE (4096)
/* eight 4096-byte pages */
#define PINNED_CHUNK_MIN_SIZE (4096*8)
/* Header of a pinned chunk; see the long comment above for the overall layout. */
typedef struct _PinnedChunk PinnedChunk;
struct _PinnedChunk {
/* link to the next chunk in a singly linked list */
PinnedChunk *next;
/* number of pages in the chunk */
int num_pages;
int *page_sizes; /* a 0 means the page is still unused */
/* NOTE(review): presumably one free-list head per freelist_sizes slot - confirm */
void **free_list;
/* lowest address treated as chunk data by obj_is_from_pinned_alloc () */
void *start_data;
void *data [1]; /* page sizes and free lists are stored here */
};
/*
* The young generation is divided into fragments. This is because
* we can hand one fragments to a thread for lock-less fast alloc and
* because the young generation ends up fragmented anyway by pinned objects.
* Once a collection is done, a list of fragments is created. When doing
* thread local alloc we use smallish nurseries so we allow new threads to
* allocate memory from gen0 without triggering a collection. Threads that
* are found to allocate lots of memory are given bigger fragments. This
* should make the finalizer thread use little nursery memory after a while.
* We should start assigning threads very small fragments: if there are many
* threads the nursery will be full of reserved space that the threads may not
* use at all, slowing down allocation speed.
*/
/* Describes one fragment of the young generation (see the comment above). */
typedef struct _Fragment Fragment;
struct _Fragment {
/* link to the next fragment in a singly linked list */
Fragment *next;
/* NOTE(review): presumably the first byte of the fragment - confirm */
char *fragment_start;
char *fragment_limit; /* the current soft limit for allocation */
/* NOTE(review): presumably the end of the fragment - confirm whether inclusive */
char *fragment_end;
};
/* the runtime can register areas of memory as roots: we keep two lists of roots,
* a pinned root set for conservatively scanned roots and a normal one for
* precisely scanned roots (currently implemented as a single list).
*/
/* One registered root area, chained through `next`. */
typedef struct _RootRecord RootRecord;
struct _RootRecord {
/* link to the next record in a singly linked list */
RootRecord *next;
/* bounds of the registered memory area */
char *start_root;
char *end_root;
/* NOTE(review): presumably a ROOT_DESC_* style descriptor telling how to scan the area - confirm */
mword root_desc;
};
/* for use with write barriers */
typedef struct _RememberedSet RememberedSet;
struct _RememberedSet {
/* NOTE(review): presumably the next free slot inside data - confirm */
mword *store_next;
/* NOTE(review): presumably the end of the data area - confirm */
mword *end_set;
/* link to the next remembered set in a singly linked list */
RememberedSet *next;
/* trailing storage for the recorded entries */
mword data [MONO_ZERO_LEN_ARRAY];
};
/* we have 4 possible values in the low 2 bits */
enum {
REMSET_LOCATION, /* just a pointer to the exact location */
REMSET_RANGE, /* range of pointer fields */
REMSET_OBJECT, /* mark all the object for scanning */
REMSET_TYPE_MASK = 0x3
};
static __thread RememberedSet *remembered_set MONO_TLS_FAST;
static RememberedSet *global_remset;
static int store_to_global_remset = 0;
/* FIXME: later choose a size that takes into account the RememberedSet struct
* and doesn't waste any alloc padding space.
*/
#define DEFAULT_REMSET_SIZE 1024
static RememberedSet* alloc_remset (int size, gpointer id);
/* Structure that corresponds to a MonoVTable: desc is a mword so requires
* no cast from a pointer to an integer
*/
typedef struct {
/* class the vtable belongs to (first word) */
MonoClass *klass;
/* GC descriptor, kept as an mword so no pointer-to-integer cast is needed */
mword desc;
} GCVTable;
/* these bits are set in the object vtable: we could merge them since an object can be
* either pinned or forwarded but not both.
* We store them in the vtable slot because the bits are used in the sync block for
* other purposes: if we merge them and alloc the sync blocks aligned to 8 bytes, we can change
* this and use bit 3 in the syncblock (with the lower two bits both set for forwarded, that
* would be an invalid combination for the monitor and hash code).
* The values are already shifted.
* The forwarding address is stored in the sync block.
*/
/* flag bits kept in the low bits of an object's first word */
#define FORWARDED_BIT 1
#define PINNED_BIT 2
/* mask covering both flag bits */
#define VTABLE_BITS_MASK 0x3
/* returns NULL if not forwarded, or the forwarded address */
/* note: obj is evaluated twice, so it must not have side effects */
#define object_is_forwarded(obj) (((mword*)(obj))[0] & FORWARDED_BIT? (void*)(((mword*)(obj))[1]): NULL)
/* set the forwarded address fw_addr for object obj */
/* the flag goes in the first word, the address in the second word */
#define forward_object(obj,fw_addr) do { \
((mword*)(obj))[0] |= FORWARDED_BIT; \
((mword*)(obj))[1] = (mword)(fw_addr); \
} while (0)
/* non-zero when the PINNED_BIT is set in the first word of obj */
#define object_is_pinned(obj) (((mword*)(obj))[0] & PINNED_BIT)
/* set the PINNED_BIT in the first word of obj */
#define pin_object(obj) do { \
((mword*)(obj))[0] |= PINNED_BIT; \
} while (0)
/* clear the PINNED_BIT in the first word of obj */
#define unpin_object(obj) do { \
((mword*)(obj))[0] &= ~PINNED_BIT; \
} while (0)
/*
* Since we set bits in the vtable, use the macro to load it from the pointer to
* an object that is potentially pinned.
*/
#define LOAD_VTABLE(addr) ((*(mword*)(addr)) & ~VTABLE_BITS_MASK)
/*
 * Returns the class name of obj. The vtable is read through LOAD_VTABLE so
 * the flag bits stored in the object's first word are masked off first.
 */
static const char*
safe_name (void* obj)
{
	return ((MonoVTable*)LOAD_VTABLE (obj))->klass->name;
}
/*
 * Computes the size in bytes of object o, reading the vtable through
 * LOAD_VTABLE so the flag bits in the first word are ignored.
 *
 * Strings: the MonoString header plus two bytes per character plus two more bytes.
 * Arrays: the MonoArray header plus the elements; when bounds are present the
 * size is rounded up to a multiple of 4 and one MonoArrayBounds per rank is added.
 * Anything else: the instance size recorded in the class.
 */
static guint
safe_object_get_size (MonoObject* o)
{
	MonoClass *klass = ((MonoVTable*)LOAD_VTABLE (o))->klass;
	MonoArray *array;
	size_t size;

	if (klass == mono_defaults.string_class)
		return sizeof (MonoString) + 2 * mono_string_length ((MonoString*) o) + 2;

	if (!klass->rank) {
		/* from a created object: the class must be inited already */
		return klass->instance_size;
	}

	array = (MonoArray*)o;
	size = sizeof (MonoArray) + mono_array_element_size (klass) * mono_array_length (array);
	if (array->bounds) {
		/* round up to a multiple of 4 before appending the bounds */
		size = (size + 3) & ~(size_t)3;
		size += sizeof (MonoArrayBounds) * klass->rank;
	}
	return size;
}
/*
* ######################################################################
* ######## Global data.
* ######################################################################
*/
static pthread_mutex_t gc_mutex = PTHREAD_MUTEX_INITIALIZER;
static int gc_disabled = 0;
static int num_minor_gcs = 0;
static int num_major_gcs = 0;
/* good sizes are 512KB-1MB: larger ones increase a lot memzeroing time */
//#define DEFAULT_NURSERY_SIZE (1024*512*125+4096*118)
#define DEFAULT_NURSERY_SIZE (1024*512*2)
#define DEFAULT_MAX_SECTION (DEFAULT_NURSERY_SIZE * 16)
#define DEFAULT_LOS_COLLECTION_TARGET (DEFAULT_NURSERY_SIZE * 2)
/* to quickly find the head of an object pinned by a conservative address
* we keep track of the objects allocated for each SCAN_START_SIZE memory
* chunk in the nursery or other memory sections. Larger values have less
* memory overhead and bigger runtime cost. 4-8 KB are reasonable values.
*/
#define SCAN_START_SIZE (4096*2)
/* the minimum size of a fragment that we consider useful for allocation */
#define FRAGMENT_MIN_SIZE (512)
/* This is a fixed value used for pinned chunks, not the system pagesize */
#define FREELIST_PAGESIZE 4096
static mword pagesize = 4096; /* FIXME */
static mword nursery_size = DEFAULT_NURSERY_SIZE;
static mword next_section_size = DEFAULT_NURSERY_SIZE * 4;
static mword max_section_size = DEFAULT_MAX_SECTION;
static int section_size_used = 0;
static LOSObject *los_object_list = NULL;
static mword los_memory_usage = 0;
static mword los_num_objects = 0;
static mword next_los_collection = 2*1024*1024; /* 2 MB, need to tune */
static mword total_alloc = 0;
/* use this to tune when to do a major/minor collection */
static mword memory_pressure = 0;
static GCMemSection *section_list = NULL;
static GCMemSection *nursery_section = NULL;
static mword lowest_heap_address = ~(mword)0;
static mword highest_heap_address = 0;
/* Entry type shared by the finalizer and disappearing-link lists declared below. */
typedef struct _FinalizeEntry FinalizeEntry;
struct _FinalizeEntry {
/* link to the next entry in a singly linked list */
FinalizeEntry *next;
/* the object this entry refers to */
void *object;
void *data; /* can be a disappearing link or the data for the finalizer */
/* Note we could use just one pointer if we don't support multiple callbacks
* for finalizers and per-finalizer data and if we store the obj pointers
* in the link like libgc does
*/
};
/*
* The finalizable hash has the object as the key, the
* disappearing_link hash, has the link address as key.
*/
static FinalizeEntry **finalizable_hash = NULL;
/* objects that are ready to be finalized */
static FinalizeEntry *fin_ready_list = NULL;
/* disappearing links use the same structure but a different list */
static FinalizeEntry **disappearing_link_hash = NULL;
static mword disappearing_link_hash_size = 0;
static mword finalizable_hash_size = 0;
static mword num_registered_finalizers = 0;
static mword num_ready_finalizers = 0;
static mword num_disappearing_links = 0;
static int no_finalize = 0;
/* keep each size a multiple of ALLOC_ALIGN */
/* on 64 bit systems 8 is likely completely unused. */
static const int freelist_sizes [] = {
8, 16, 24, 32, 40, 48, 64, 80,
96, 128, 160, 192, 224, 256, 320, 384,
448, 512, 584, 680, 816, 1024, 1360, 2048};
#define FREELIST_NUM_SLOTS (sizeof (freelist_sizes) / sizeof (freelist_sizes [0]))
static char* max_pinned_chunk_addr = NULL;
static char* min_pinned_chunk_addr = (char*)-1;
/* pinned_chunk_list is used for allocations of objects that are never moved */
static PinnedChunk *pinned_chunk_list = NULL;
/* internal_chunk_list is used for allocating structures needed by the GC */
static PinnedChunk *internal_chunk_list = NULL;
/*
 * Returns TRUE when p lies inside the data area of one of the chunks on
 * pinned_chunk_list, that is between the chunk's start_data (inclusive) and
 * the end of its num_pages pages of FREELIST_PAGESIZE bytes (exclusive).
 * Returns FALSE otherwise.
 */
static gboolean
obj_is_from_pinned_alloc (char *p)
{
	PinnedChunk *chunk;

	for (chunk = pinned_chunk_list; chunk != NULL; chunk = chunk->next) {
		char *chunk_end = (char*)chunk + chunk->num_pages * FREELIST_PAGESIZE;

		if (p < (char*)chunk->start_data)
			continue;
		if (p < chunk_end)
			return TRUE;
	}
	return FALSE;
}
/* registered roots: the key to the hash is the root start address */
static RootRecord **roots_hash = NULL;
static int roots_hash_size = 0;
static mword roots_size = 0; /* amount of memory in the root set */
static mword num_roots_entries = 0;
/*
* The current allocation cursors
* We allocate objects in the nursery.
* The nursery is the area between nursery_start and nursery_real_end.
* nursery_next is the pointer to the space where the next object will be allocated.
* nursery_temp_end is the pointer to the end of the temporary space reserved for
* the allocation: this allows us to allow allocations inside the fragments of the
* nursery (the empty holes between pinned objects) and it allows us to set the
* scan starts at reasonable intervals.
* nursery_next and nursery_temp_end will become per-thread vars to allow lock-free
* allocations.
* nursery_first_pinned_start points to the start of the first pinned object in the nursery
* nursery_last_pinned_end points to the end of the last pinned object in the nursery
* At the next allocation, the area of the nursery where objects can be present is
* between MIN(nursery_first_pinned_start, first_fragment_start) and
* MAX(nursery_last_pinned_end, nursery_temp_end)
*/
static char *nursery_start = NULL;
static char *nursery_next = NULL;
static char *nursery_temp_end = NULL;
static char *nursery_real_end = NULL;
static char *nursery_frag_real_end = NULL;
static char *nursery_first_pinned_start = NULL;
static char *nursery_last_pinned_end = NULL;
/* fragments that are free and ready to be used for allocation */
static Fragment *nursery_fragments = NULL;
/* freelist of fragment structures */
static Fragment *fragment_freelist = NULL;
/*
* used when moving the objects
* When the nursery is collected, objects are copied to to_space.
* The area between to_space and gray_objects is used as a stack
* of objects that need their fields checked for more references
* to be copied.
* We should optimize somehow this mechanism to avoid rescanning
* ptr-free objects. The order is also probably not optimal: need to
* test cache misses and other graph traversal orders.
*/
static char *to_space = NULL;
static char *gray_objects = NULL;
static char *to_space_end = NULL;
static GCMemSection *to_space_section = NULL;
/* objects bigger than this go into the large object space */
#define MAX_SMALL_OBJ_SIZE 0xffff
/*
* ######################################################################
* ######## Macros and function declarations.
* ######################################################################
*/
/*
* Recursion is not allowed for the thread lock.
*/
#define LOCK_GC pthread_mutex_lock (&gc_mutex)
#define UNLOCK_GC pthread_mutex_unlock (&gc_mutex)
/*
 * Widens the recorded heap range: lowest_heap_address is lowered to `low` and
 * highest_heap_address is raised to `high` when they fall outside the current range.
 * Each argument is evaluated up to twice, so it must not have side effects.
 */
#define UPDATE_HEAP_BOUNDARIES(low,high) do { \
if ((mword)(low) < lowest_heap_address) \
lowest_heap_address = (mword)(low); \
if ((mword)(high) > highest_heap_address) \
highest_heap_address = (mword)(high); \
} while (0)
inline static void*
align_pointer (void *ptr)
{
mword p = (mword)ptr;
p += sizeof (gpointer) - 1;
p &= ~ (sizeof (gpointer) - 1);
return (void*)p;
}
/* forward declarations */
static void* get_internal_mem (size_t size);
static void free_internal_mem (void *addr);
static void* get_os_memory (size_t size, int activate);
static void free_os_memory (void *addr, size_t size);
static void report_internal_mem_usage (void);
static int stop_world (void);
static int restart_world (void);
static void pin_thread_data (void *start_nursery, void *end_nursery);
static void scan_from_remsets (void *start_nursery, void *end_nursery);
static void find_pinning_ref_from_thread (char *obj, size_t size);
static void update_current_thread_stack (void *start);
static GCMemSection* alloc_section (size_t size);
static void finalize_in_range (void **start, void **end);
static void null_link_in_range (void **start, void **end);
static gboolean search_fragment_for_size (size_t size);
static void mark_pinned_from_addresses (PinnedChunk *chunk, void **start, void **end);
static void clear_remsets (void);
static void sweep_pinned_objects (void);
static void free_large_object (LOSObject *obj);
static void free_mem_section (GCMemSection *section);
/*
* ######################################################################
* ######## GC descriptors
* ######################################################################
* Used to quickly get the info the GC needs about an object: size and
* where the references are held.
*/
/* objects are aligned to 8 bytes boundaries
* A descriptor is a pointer in MonoVTable, so 32 or 64 bits of size.
* The low 3 bits define the type of the descriptor. The other bits
* depend on the type.
* As a general rule the 13 remaining low bits define the size, either
* of the whole object or of the elements in the arrays. While for objects
* the size is already in bytes, for arrays we need to shift, because
* array elements might be smaller than 8 bytes. In case of arrays, we
* use two bits to describe what the additional high bits represents,
* so the default behaviour can handle element sizes less than 2048 bytes.
* The high 16 bits, if 0 it means the object is pointer-free.
* This design should make it easy and fast to skip over ptr-free data.
* The first 4 types should cover >95% of the objects.
* Note that since the size of objects is limited to 64K, larger objects
* will be allocated in the large object heap.
* If we want 4-bytes alignment, we need to put vector and small bitmap
* inside complex.
*/
enum {
DESC_TYPE_RUN_LENGTH, /* 16 bits aligned byte size | 1-3 (offset, numptr) bytes tuples */
DESC_TYPE_SMALL_BITMAP, /* 16 bits aligned byte size | 16-48 bit bitmap */
DESC_TYPE_STRING, /* nothing */
DESC_TYPE_COMPLEX, /* index for bitmap into complex_descriptors */
DESC_TYPE_VECTOR, /* 10 bits element size | 1 bit array | 2 bits desc | element desc */
DESC_TYPE_ARRAY, /* 10 bits element size | 1 bit array | 2 bits desc | element desc */
DESC_TYPE_LARGE_BITMAP, /* | 29-61 bitmap bits */
DESC_TYPE_COMPLEX_ARR, /* index for bitmap into complex_descriptors */
/* subtypes for arrays and vectors */
DESC_TYPE_V_PTRFREE = 0,/* there are no refs: keep first so it has a zero value */
DESC_TYPE_V_REFS, /* all the array elements are refs */
DESC_TYPE_V_RUN_LEN, /* elements are run-length encoded as DESC_TYPE_RUN_LENGTH */
DESC_TYPE_V_BITMAP /* elements are as the bitmap in DESC_TYPE_SMALL_BITMAP */
};
#define OBJECT_HEADER_WORDS (sizeof(MonoObject)/sizeof(gpointer))
#define LOW_TYPE_BITS 3
#define SMALL_BITMAP_SHIFT 16
#define SMALL_BITMAP_SIZE (GC_BITS_PER_WORD - SMALL_BITMAP_SHIFT)
#define VECTOR_INFO_SHIFT 14
#define VECTOR_ELSIZE_SHIFT 3
#define LARGE_BITMAP_SIZE (GC_BITS_PER_WORD - LOW_TYPE_BITS)
#define MAX_SMALL_SIZE ((1 << SMALL_BITMAP_SHIFT) - 1)
#define SMALL_SIZE_MASK 0xfff8
#define MAX_ELEMENT_SIZE 0x3ff
#define ELEMENT_SIZE_MASK (0x3ff << LOW_TYPE_BITS)
#define VECTOR_SUBTYPE_PTRFREE (DESC_TYPE_V_PTRFREE << VECTOR_INFO_SHIFT)
#define VECTOR_SUBTYPE_REFS (DESC_TYPE_V_REFS << VECTOR_INFO_SHIFT)
#define VECTOR_SUBTYPE_RUN_LEN (DESC_TYPE_V_RUN_LEN << VECTOR_INFO_SHIFT)
#define VECTOR_SUBTYPE_BITMAP (DESC_TYPE_V_BITMAP << VECTOR_INFO_SHIFT)
#define ALLOC_ALIGN 8
/* Root bitmap descriptors are simpler: the lower two bits describe the type
* and we either have 30/62 bitmap bits or nibble-based run-length,
* or a complex descriptor
*/
enum {
ROOT_DESC_CONSERVATIVE, /* 0, so matches NULL value */
ROOT_DESC_BITMAP,
ROOT_DESC_RUN_LEN,
ROOT_DESC_LARGE_BITMAP,
ROOT_DESC_TYPE_MASK = 0x3,
ROOT_DESC_TYPE_SHIFT = 2,
};
static gsize* complex_descriptors = NULL;
static int complex_descriptors_size = 0;
static int complex_descriptors_next = 0;
/*
 * alloc_complex_descriptor:
 * @bitmap: bitmap words describing the object, at least @numbits bits long
 * @numbits: number of valid bits in @bitmap
 *
 * Stores @bitmap in the complex_descriptors table and returns the index of
 * the entry. Each entry is made of one word holding the total length of the
 * entry in words (that word included), followed by the bitmap words.
 * If an identical entry is already present its index is returned instead, so
 * no duplicates are created.
 *
 * Only the words needed to hold @numbits bits are read from @bitmap: the
 * count is rounded up to whole words, so a bitmap whose length is an exact
 * multiple of the word size is not read past its end.
 *
 * The GC lock is taken for the whole operation.
 */
static int
alloc_complex_descriptor (gsize *bitmap, int numbits)
{
	/* one length word plus the words needed to hold numbits bits */
	int nwords = (numbits + GC_BITS_PER_WORD - 1) / GC_BITS_PER_WORD + 1;
	int res;
	int i;

	LOCK_GC;
	res = complex_descriptors_next;
	/* linear search, so we don't have duplicates with domain load/unload
	 * this should not be performance critical or we'd have bigger issues
	 * (the number and size of complex descriptors should be small).
	 */
	for (i = 0; i < complex_descriptors_next; ) {
		if (complex_descriptors [i] == (gsize)nwords) {
			int j, found = TRUE;
			for (j = 0; j < nwords - 1; ++j) {
				if (complex_descriptors [i + 1 + j] != bitmap [j]) {
					found = FALSE;
					break;
				}
			}
			if (found) {
				UNLOCK_GC;
				return i;
			}
		}
		/* the first word of each entry is its length: skip to the next entry */
		i += complex_descriptors [i];
	}
	if (complex_descriptors_next + nwords > complex_descriptors_size) {
		/* grow geometrically, always leaving room for the new entry */
		int new_size = complex_descriptors_size * 2 + nwords;
		complex_descriptors = g_realloc (complex_descriptors, new_size * sizeof (gsize));
		complex_descriptors_size = new_size;
	}
	DEBUG (6, fprintf (gc_debug_file, "Complex descriptor %d, size: %d (total desc memory: %d)\n", res, nwords, complex_descriptors_size));
	complex_descriptors_next += nwords;
	complex_descriptors [res] = nwords;
	for (i = 0; i < nwords - 1; ++i) {
		complex_descriptors [res + 1 + i] = bitmap [i];
		DEBUG (6, fprintf (gc_debug_file, "\tvalue: %p\n", (void*)complex_descriptors [res + 1 + i]));
	}
	UNLOCK_GC;
	return res;
}
/*
* Descriptor builders.
*/
/* Strings carry no bitmap: their descriptor is just the DESC_TYPE_STRING tag. */
void*
mono_gc_make_descr_for_string (void)
{
	void *descr = (void*) DESC_TYPE_STRING;

	return descr;
}
/*
 * mono_gc_make_descr_for_object:
 * @bitmap: one bit per pointer-sized word of the object, set for reference slots
 * @numbits: number of bits to examine in @bitmap
 * @obj_size: size of the object in bytes
 *
 * Builds the GC descriptor for an object, trying the encodings from the most
 * compact to the most general:
 *  - run-length: type | aligned size | first slot << 16 | slot count << 24
 *  - small bitmap: type | aligned size | bitmap (header skipped) << SMALL_BITMAP_SHIFT
 *  - large bitmap: type | bitmap (header skipped) << LOW_TYPE_BITS
 *  - complex: type | index into complex_descriptors << LOW_TYPE_BITS
 *
 * Returns: the descriptor, encoded in a pointer-sized value.
 */
void*
mono_gc_make_descr_for_object (gsize *bitmap, int numbits, size_t obj_size)
{
	int first_set = -1, num_set = 0, last_set = -1, i;
	mword desc = 0;
	size_t stored_size = obj_size;

	/* the size is stored rounded up to the allocation alignment */
	stored_size += ALLOC_ALIGN - 1;
	stored_size &= ~(ALLOC_ALIGN - 1);
	for (i = 0; i < numbits; ++i) {
		/* the mask must be word-sized: an int 1 can't reach the upper bits of a 64 bit word */
		if (bitmap [i / GC_BITS_PER_WORD] & ((gsize)1 << (i % GC_BITS_PER_WORD))) {
			if (first_set < 0)
				first_set = i;
			last_set = i;
			num_set++;
		}
	}
	if (stored_size <= MAX_SMALL_OBJ_SIZE) {
		/* check run-length encoding first: one byte offset, one byte number of pointers
		 * on 64 bit archs, we can have 3 runs, just one on 32.
		 * It may be better to use nibbles.
		 */
		if (first_set < 0) {
			desc = DESC_TYPE_RUN_LENGTH | stored_size;
			DEBUG (6, fprintf (gc_debug_file, "Ptrfree descriptor %p, size: %d\n", (void*)desc, (int)stored_size));
			return (void*) desc;
		} else if (first_set < 256 && num_set < 256 && (first_set + num_set == last_set + 1)) {
			/* shift as mword: num_set << 24 overflows a signed int from 128 on */
			desc = DESC_TYPE_RUN_LENGTH | stored_size | ((mword)first_set << 16) | ((mword)num_set << 24);
			DEBUG (6, fprintf (gc_debug_file, "Runlen descriptor %p, size: %d, first set: %d, num set: %d\n", (void*)desc, (int)stored_size, first_set, num_set));
			return (void*) desc;
		}
		/* we know the 2-word header is ptr-free */
		if (last_set < SMALL_BITMAP_SIZE + OBJECT_HEADER_WORDS) {
			desc = DESC_TYPE_SMALL_BITMAP | stored_size | ((*bitmap >> OBJECT_HEADER_WORDS) << SMALL_BITMAP_SHIFT);
			DEBUG (6, fprintf (gc_debug_file, "Smallbitmap descriptor %p, size: %d, last set: %d\n", (void*)desc, (int)stored_size, last_set));
			return (void*) desc;
		}
	}
	/* we know the 2-word header is ptr-free */
	/* NOTE(review): OBJECT_HEADER_WORDS is a size_t, so this comparison is
	 * unsigned; when no bit is set (last_set == -1) it fails and the object
	 * takes the complex path below - confirm that this is intended.
	 */
	if (last_set < LARGE_BITMAP_SIZE + OBJECT_HEADER_WORDS) {
		desc = DESC_TYPE_LARGE_BITMAP | ((*bitmap >> OBJECT_HEADER_WORDS) << LOW_TYPE_BITS);
		DEBUG (6, fprintf (gc_debug_file, "Largebitmap descriptor %p, size: %d, last set: %d\n", (void*)desc, (int)stored_size, last_set));
		return (void*) desc;
	}
	/* it's a complex object ... */
	desc = DESC_TYPE_COMPLEX | (alloc_complex_descriptor (bitmap, last_set + 1) << LOW_TYPE_BITS);
	return (void*) desc;
}
/*
 * mono_gc_make_descr_for_array:
 * @vector: non-zero to build a DESC_TYPE_VECTOR descriptor, zero for DESC_TYPE_ARRAY
 * @elem_bitmap: one bit per pointer-sized word of an element, set for reference slots
 * @numbits: number of bits to examine in @elem_bitmap
 * @elem_size: size of one element in bytes
 *
 * Builds the GC descriptor for an array.  If the array holds references,
 * numbits == 1 and the first bit is set in elem_bitmap.
 * Elements of at most MAX_ELEMENT_SIZE bytes get their size and a
 * DESC_TYPE_V_* subtype stored inline; everything else is described by an
 * entry in complex_descriptors.
 *
 * Returns: the descriptor, encoded in a pointer-sized value.
 */
void*
mono_gc_make_descr_for_array (int vector, gsize *elem_bitmap, int numbits, size_t elem_size)
{
	int first_set = -1, num_set = 0, last_set = -1, i;
	mword desc = vector? DESC_TYPE_VECTOR: DESC_TYPE_ARRAY;

	for (i = 0; i < numbits; ++i) {
		/* the mask must be word-sized: an int 1 can't reach the upper bits of a 64 bit word */
		if (elem_bitmap [i / GC_BITS_PER_WORD] & ((gsize)1 << (i % GC_BITS_PER_WORD))) {
			if (first_set < 0)
				first_set = i;
			last_set = i;
			num_set++;
		}
	}
	if (elem_size <= MAX_ELEMENT_SIZE) {
		desc |= elem_size << VECTOR_ELSIZE_SHIFT;
		if (!num_set) {
			return (void*)(desc | VECTOR_SUBTYPE_PTRFREE);
		}
		/* Note: we also handle structs with just ref fields */
		if (num_set * sizeof (gpointer) == elem_size) {
			/* all the bits from 16 up are set; built from an unsigned value
			 * because left-shifting a negative number is undefined */
			return (void*)(desc | VECTOR_SUBTYPE_REFS | (~(mword)0 << 16));
		}
		/* FIXME: try run-len first */
		/* Note: we can't skip the object header here, because it's not present */
		/* only bits 0 .. SMALL_BITMAP_SIZE - 1 survive the shift by 16 below,
		 * so the highest set bit must be strictly below SMALL_BITMAP_SIZE */
		if (last_set < SMALL_BITMAP_SIZE) {
			return (void*)(desc | VECTOR_SUBTYPE_BITMAP | (*elem_bitmap << 16));
		}
	}
	/* it's an array of complex structs ... */
	desc = DESC_TYPE_COMPLEX_ARR;
	desc |= alloc_complex_descriptor (elem_bitmap, last_set + 1) << LOW_TYPE_BITS;
	return (void*) desc;
}
/* helper macros to scan and traverse objects, macros because we reuse them in many functions */
/*
 * STRING_SIZE: stores in size the byte size of the MonoString str: the
 * MonoString struct plus two bytes for each of (length + 1) characters
 * (the extra one is presumably the terminator), rounded up to a multiple
 * of ALLOC_ALIGN.
 */
#define STRING_SIZE(size,str) do { \
(size) = sizeof (MonoString) + 2 * (mono_string_length ((MonoString*)(str)) + 1); \
(size) += (ALLOC_ALIGN - 1); \
(size) &= ~(ALLOC_ALIGN - 1); \
} while (0)
/*
 * Run-length and small-bitmap descriptors both keep the aligned object size
 * in the bits selected by SMALL_SIZE_MASK; these macros store it in size.
 * The obj argument is not used.
 */
#define OBJ_RUN_LEN_SIZE(size,vt,obj)	do {	\
		(size) = (vt)->desc & SMALL_SIZE_MASK;	\
	} while (0)
#define OBJ_BITMAP_SIZE(size,vt,obj)	do {	\
		(size) = (vt)->desc & SMALL_SIZE_MASK;	\
	} while (0)
/* PREFETCH currently expands to nothing; the disabled definition below would issue a prefetchnta for addr. */
//#define PREFETCH(addr) __asm__ __volatile__ (" prefetchnta %0": : "m"(*(char *)(addr)))
#define PREFETCH(addr)
/* code using these macros must define a HANDLE_PTR(ptr) macro that does the work */
/*
 * Run-length descriptors: bits 16-23 hold the index of the first reference
 * slot (in words from the start of the object) and bits 24-31 the number of
 * consecutive reference slots.  HANDLE_PTR is invoked on each of them.
 */
#define OBJ_RUN_LEN_FOREACH_PTR(vt,obj)	do {	\
		/* nothing above bit 15 means the object has no references */	\
		if ((vt)->desc & 0xffff0000) {	\
			void **_objptr = (void**)(obj) + (((vt)->desc >> 16) & 0xff);	\
			void **_objptr_end = _objptr + (((vt)->desc >> 24) & 0xff);	\
			for (; _objptr < _objptr_end; _objptr++) {	\
				HANDLE_PTR (_objptr, (obj));	\
			}	\
		}	\
	} while (0)
/*
 * Small-bitmap descriptors: the bits from SMALL_BITMAP_SHIFT up flag, one per
 * word, the reference slots that follow the object header.  A bitmap
 * descriptor implies that the object has references (otherwise run-length
 * would have been chosen): an assert could be added to check it.
 */
#define OBJ_BITMAP_FOREACH_PTR(vt,obj)	do {	\
		void **_objptr = (void**)(obj) + OBJECT_HEADER_WORDS;	\
		gsize _bmap;	\
		for (_bmap = (vt)->desc >> SMALL_BITMAP_SHIFT; _bmap; _bmap >>= 1, ++_objptr) {	\
			if ((_bmap & 1)) {	\
				HANDLE_PTR (_objptr, (obj));	\
			}	\
		}	\
	} while (0)
/*
 * Large-bitmap descriptors: every bit above the type tag flags, one per word,
 * a reference slot following the object header.
 */
#define OBJ_LARGE_BITMAP_FOREACH_PTR(vt,obj)	do {	\
		void **_objptr = (void**)(obj) + OBJECT_HEADER_WORDS;	\
		gsize _bmap;	\
		for (_bmap = (vt)->desc >> LOW_TYPE_BITS; _bmap; _bmap >>= 1, ++_objptr) {	\
			if ((_bmap & 1)) {	\
				HANDLE_PTR (_objptr, (obj));	\
			}	\
		}	\
	} while (0)
#define OBJ_COMPLEX_FOREACH_PTR(vt,obj) do { \
/* there are pointers */ \
void **_objptr = (void**)(obj); \
gsize *bitmap_data = complex_descriptors + ((vt)->desc >> LOW_TYPE_BITS); \
int bwords = (*bitmap_data) - 1; \
void **start_run = _objptr; \
bitmap_data++; \
if (0) { \
MonoObject *myobj = (MonoObject*)obj; \
g_print ("found %d at %p (0x%x): %s.%s\n", bwords, (obj), (vt)->desc, myobj->vtable->klass->name_space, myobj->vtable->klass->name); \
} \
while (bwords-- > 0) { \
gsize _bmap = *bitmap_data++; \
_objptr = start_run; \
/*g_print ("bitmap: 0x%x/%d at %p\n", _bmap, bwords, _objptr);*/ \
while (_bmap) { \
if ((_bmap & 1)) { \
HANDLE_PTR (_objptr, (obj)); \
} \
_bmap >>= 1; \
++_objptr; \
} \
start_run += GC_BITS_PER_WORD; \
} \
} while (0)
/* this one is untested */
/*
 * Arrays of complex structs: the bits above the type tag are an index into
 * complex_descriptors, where a length word (which counts itself) is followed
 * by the bitmap words describing ONE element.  The bitmap is applied to
 * every element of the array in turn; there is no object header inside an
 * element, so bit 0 is the first word of the element.
 * HANDLE_PTR is invoked on every slot whose bit is set.
 */
#define OBJ_COMPLEX_ARR_FOREACH_PTR(vt,obj)	do {	\
		/* there are pointers */	\
		gsize *mbitmap_data = complex_descriptors + ((vt)->desc >> LOW_TYPE_BITS);	\
		int mbwords = (*mbitmap_data++) - 1;	\
		int el_size = mono_array_element_size (((MonoObject*)(obj))->vtable->klass);	\
		char *e_start = (char*)(obj) + G_STRUCT_OFFSET (MonoArray, vector);	\
		char *e_end = e_start + el_size * mono_array_length ((MonoArray*)(obj));	\
		/* disabled debugging aid: it must only use the macro arguments, so	\
		 * that the macro does not depend on names defined by the caller */	\
		if (0) {	\
			MonoObject *myobj = (MonoObject*)(obj);	\
			g_print ("found %d at %p (0x%x): %s.%s\n", mbwords, (obj), (vt)->desc, myobj->vtable->klass->name_space, myobj->vtable->klass->name);	\
		}	\
		while (e_start < e_end) {	\
			void **_objptr = (void**)e_start;	\
			/* restart from the first bitmap word for each element */	\
			gsize *bitmap_data = mbitmap_data;	\
			unsigned int bwords = mbwords;	\
			while (bwords-- > 0) {	\
				gsize _bmap = *bitmap_data++;	\
				void **start_run = _objptr;	\
				/*g_print ("bitmap: 0x%x\n", _bmap);*/	\
				while (_bmap) {	\
					if ((_bmap & 1)) {	\
						HANDLE_PTR (_objptr, (obj));	\
					}	\
					_bmap >>= 1;	\
					++_objptr;	\
				}	\
				/* the next bitmap word covers the next GC_BITS_PER_WORD slots */	\
				_objptr = start_run + GC_BITS_PER_WORD;	\
			}	\
			e_start += el_size;	\
		}	\
	} while (0)
/*
 * Vector/array descriptors with inline element info: the element size sits
 * at VECTOR_ELSIZE_SHIFT, the DESC_TYPE_V_* subtype at VECTOR_INFO_SHIFT and
 * the bits from 16 up describe the references inside one element.
 * HANDLE_PTR is invoked on every reference slot of every element.
 */
#define OBJ_VECTOR_FOREACH_PTR(vt,obj)	do {	\
		/* note: 0xffffc000 excludes DESC_TYPE_V_PTRFREE */	\
		if ((vt)->desc & 0xffffc000) {	\
			/* there are pointers */	\
			int el_size = ((vt)->desc >> VECTOR_ELSIZE_SHIFT) & MAX_ELEMENT_SIZE;	\
			int etype = (vt)->desc & (3 << VECTOR_INFO_SHIFT);	\
			if (etype == VECTOR_SUBTYPE_REFS) {	\
				/* Note: this code can handle also arrays of struct with only references in them */	\
				void **p = (void**)((char*)(obj) + G_STRUCT_OFFSET (MonoArray, vector));	\
				void **end_refs = (void**)((char*)p + el_size * mono_array_length ((MonoArray*)(obj)));	\
				for (; p < end_refs; ++p) {	\
					HANDLE_PTR (p, (obj));	\
				}	\
			} else if (etype == VECTOR_SUBTYPE_RUN_LEN) {	\
				/* one run per element: first slot in bits 16-23, slot count in bits 24-31 */	\
				int offset = ((vt)->desc >> 16) & 0xff;	\
				int num_refs = ((vt)->desc >> 24) & 0xff;	\
				char *e_start = (char*)(obj) + G_STRUCT_OFFSET (MonoArray, vector);	\
				char *e_end = e_start + el_size * mono_array_length ((MonoArray*)(obj));	\
				for (; e_start < e_end; e_start += el_size) {	\
					void **p = (void**)e_start + offset;	\
					int i;	\
					for (i = 0; i < num_refs; ++i) {	\
						HANDLE_PTR (p + i, (obj));	\
					}	\
				}	\
			} else if (etype == VECTOR_SUBTYPE_BITMAP) {	\
				char *e_start = (char*)(obj) + G_STRUCT_OFFSET (MonoArray, vector);	\
				char *e_end = e_start + el_size * mono_array_length ((MonoArray*)(obj));	\
				for (; e_start < e_end; e_start += el_size) {	\
					/* Note: there is no object header here to skip */	\
					void **p = (void**)e_start;	\
					gsize _bmap;	\
					for (_bmap = (vt)->desc >> 16; _bmap; _bmap >>= 1, ++p) {	\
						if ((_bmap & 1)) {	\
							HANDLE_PTR (p, (obj));	\
						}	\
					}	\
				}	\
			}	\
		}	\
	} while (0)
/*
 * Counters, both starting at zero.
 * NOTE(review): the code that updates them is outside this section; presumably
 * they are statistics maintained by the HANDLE_PTR definition that follows - confirm.
 */
static mword new_obj_references = 0;
static mword obj_references_checked = 0;
#undef HANDLE_PTR
#define HANDLE_PTR(ptr,obj) do { \
if (*(ptr) && (char*)*(ptr) >= nursery_start && (char*)*(ptr) < nursery_next) { \
MonoObject *o = (MonoObject*)(obj); \