From 7c89fec70e53d5d0b2403b7219efc86204405540 Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Wed, 13 Dec 2023 11:02:07 +0100 Subject: [PATCH 1/6] id heap: fix alignment of allocated values The id heap has the property that allocated values are aligned to the nearest power-of-2 value equal to or greater than the allocation size. In the current code, an allocation served by an id heap range whose start value is not aligned to the allocation size can return a misaligned value. This patch fixes the above issue by removing the padding of the start argument in the bitmap allocation function (the start value is already appropriately aligned by the id heap in the id_alloc_from_range() function). To optimize the performance of bitmap allocations, bitmap ranges added to an id heap now have a start value aligned to BITMAP_WORDLEN (even if the corresponding id heap range is not aligned), so that the start argument to bitmap_alloc_within_range() is always aligned to BITMAP_WORDLEN for allocation sizes greater than (BITMAP_WORDLEN / 2). The bitmap_start field of struct id_range holds the aligned start value of a range, and is used to apply the appropriate offsets during allocations and deallocations. Now that the id heap works correctly with arbitrarily aligned ranges, the 2MB padding of the start value of physical memory ranges can be removed; this padding caused a noticeable waste of memory on low-memory machines. --- platform/pc/service.c | 9 ++------- platform/riscv-virt/service.c | 2 +- platform/virt/service.c | 5 +---- src/runtime/bitmap.c | 19 ++++++------------- src/runtime/bitmap.h | 4 ++++ src/runtime/heap/id.c | 29 +++++++++++++++++++++-------- test/unit/id_heap_test.c | 24 ++++++++++++++++++++++++ 7 files changed, 59 insertions(+), 33 deletions(-) diff --git a/platform/pc/service.c b/platform/pc/service.c index f774749fd..9bcae0a04 100644 --- a/platform/pc/service.c +++ b/platform/pc/service.c @@ -332,15 +332,10 @@ id_heap init_physical_id_heap(heap h) early_init_debug("physical memory:"); for_regions(e) { if (e->type == REGION_PHYSICAL) { - /* Align for 2M pages */ u64 base = e->base; - u64 end = base + e->length; - u64 page2m_mask = MASK(PAGELOG_2M); - base = (base + page2m_mask) & ~page2m_mask; - end &= ~MASK(PAGELOG); - if (base >= end) + u64 length = e->length; + if (length == 0) continue; - u64 length = end - base; #ifdef INIT_DEBUG early_debug("INIT: ["); early_debug_u64(base); diff --git a/platform/riscv-virt/service.c b/platform/riscv-virt/service.c index e3bb30850..e555d846e 100644 --- a/platform/riscv-virt/service.c +++ b/platform/riscv-virt/service.c @@ -47,7 +47,7 @@ id_heap init_physical_id_heap(heap h) u64 end = PHYSMEM_BASE + mem_size; u64 bootstrap_size = init_bootstrap_heap(end - base); map(BOOTSTRAP_BASE, base, bootstrap_size, pageflags_writable(pageflags_memory())); - base = pad(base + bootstrap_size, PAGESIZE_2M); + base += bootstrap_size; init_debug("\nfree base "); init_debug_u64(base); init_debug("\nend "); diff --git a/platform/virt/service.c b/platform/virt/service.c index 0696d7fb8..aad8ed882 100644 --- a/platform/virt/service.c +++ b/platform/virt/service.c @@ -110,9 +110,6 @@ static void add_heap_range_internal(id_heap h, range r, range *remainder) r = *remainder; *remainder = tmp; } - r.start = pad(r.start, PAGESIZE_2M); - if (r.start >= r.end) - return; init_debug("adding range [0x"); init_debug_u64(r.start); init_debug(" 0x"); @@ -198,7 +195,7 @@ id_heap init_physical_id_heap(heap h)
u64 end = base + get_memory_size(pointer_from_u64(DEVICETREE_BLOB_BASE)); u64 bootstrap_size = init_bootstrap_heap(end - base); map(BOOTSTRAP_BASE, base, bootstrap_size, pageflags_writable(pageflags_memory())); - base = pad(base + bootstrap_size, PAGESIZE_2M); + base += bootstrap_size; init_debug("\nfree base "); init_debug_u64(base); init_debug("\nend "); diff --git a/src/runtime/bitmap.c b/src/runtime/bitmap.c index 6308ed2d8..d4eb0f616 100644 --- a/src/runtime/bitmap.c +++ b/src/runtime/bitmap.c @@ -1,7 +1,11 @@ /* bitmap allocator Allocations are aligned to next power-of-2 equal or greater than - allocation size. This allows a simpler and faster allocation + allocation size. For allocations done with bitmap_alloc_within_range(), + the above alignment rule applies to the offset from the start argument; + for optimal performance, the start argument should be a multiple of + BITMAP_WORDLEN for allocation size values greater than (BITMAP_WORDLEN / 2). + This allows a simpler and faster allocation algorithm, and also has the side-effect of allowing mixed page sizes (e.g. 4K + 2M) within the same space. @@ -11,10 +15,6 @@ #include -#define BITMAP_WORDLEN_LOG 6 -#define BITMAP_WORDLEN (1 << BITMAP_WORDLEN_LOG) -#define BITMAP_WORDMASK (BITMAP_WORDLEN - 1) - static inline u64 * pointer_from_bit(u64 * base, u64 bit) { return base + (bit >> BITMAP_WORDLEN_LOG); @@ -121,7 +121,7 @@ static inline u64 bitmap_alloc_internal(bitmap b, u64 nbits, u64 startbit, u64 e u64 stride = U64_FROM_BIT(order); endbit = MIN(endbit, b->maxbits); - u64 bit = pad(startbit, stride); + u64 bit = startbit; if (bit + nbits > endbit) return INVALID_PHYSICAL; @@ -190,13 +190,6 @@ boolean bitmap_dealloc(bitmap b, u64 bit, u64 size) u64 * mapbase = bitmap_base(b); assert(mapbase); - /* XXX maybe error code instead of msg_err... 
*/ - if (bit & (size - 1)) { - msg_err("bitmap %p, bit %ld is not aligned to order %ld\n", - b, bit, order); - return false; - } - if (bit + size > b->maxbits) { msg_err("bitmap %p, bit %ld, order %ld: exceeds bit length %ld\n", b, bit, order, b->maxbits); diff --git a/src/runtime/bitmap.h b/src/runtime/bitmap.h index 44d6d550f..376ced018 100644 --- a/src/runtime/bitmap.h +++ b/src/runtime/bitmap.h @@ -1,3 +1,7 @@ +#define BITMAP_WORDLEN_LOG 6 +#define BITMAP_WORDLEN (1 << BITMAP_WORDLEN_LOG) +#define BITMAP_WORDMASK (BITMAP_WORDLEN - 1) + /* XXX keep allocs small for now; rolling heap allocations more than a page are b0rked */ #define ALLOC_EXTEND_BITS U64_FROM_BIT(12) diff --git a/src/runtime/heap/id.c b/src/runtime/heap/id.c index be56d805f..6bc9530a8 100644 --- a/src/runtime/heap/id.c +++ b/src/runtime/heap/id.c @@ -19,6 +19,7 @@ typedef struct id_range { struct rmnode n; /* range in pages */ bitmap b; + u64 bitmap_start; u64 next_bit[NEXT_BIT_COUNT]; /* for next-fit search */ } *id_range; @@ -47,7 +48,7 @@ static id_range id_add_range(id_heap i, u64 base, u64 length) if (length == infinity) { length -= base; /* bitmap will round up to next 64 page boundary, don't wrap */ - length &= ~((1ull << (page_order(i) + 6)) - 1); + length &= ~((1ull << (page_order(i) + BITMAP_WORDLEN_LOG)) - 1); } /* assert only page-sized and non-zero */ @@ -67,11 +68,23 @@ static id_range id_add_range(id_heap i, u64 base, u64 length) msg_err("%s: range insertion failure; conflict with range %R\n", __func__, ir->n.r); goto fail; } - ir->b = allocate_bitmap(i->meta, i->map, pages); + + /* To optimize performance of bitmap allocations, make the bitmap range start at a + * BITMAP_WORDLEN-aligned value, so that the start argument to bitmap_alloc_within_range() is + * always aligned to BITMAP_WORDLEN for allocation size values greater than + * (BITMAP_WORDLEN / 2). */ + u64 page_start_mask = page_start & BITMAP_WORDMASK; + + ir->b = allocate_bitmap(i->meta, i->map, pages + page_start_mask); if (ir->b == INVALID_ADDRESS) { msg_err("%s: failed to allocate bitmap for range %R\n", __func__, ir->n.r); goto fail; } + if (page_start_mask) + /* Mark the initial bits (which are not part of the range supplied to this function) as + * allocated, to prevent the bitmap from returning these bits during allocations. 
*/ + bitmap_range_check_and_set(ir->b, 0, page_start_mask, false, true); + ir->bitmap_start = page_start & ~BITMAP_WORDMASK; zero(ir->next_bit, sizeof(ir->next_bit)); i->total += length; id_debug("added range base 0x%lx, end 0x%lx (length 0x%lx)\n", base, base + length, length); @@ -109,8 +122,8 @@ static u64 id_alloc_from_range(id_heap i, id_range r, u64 pages, range subrange) /* align search start (but not end, for pages may fit in a non-power-of-2 remainder at the end of the range) and remove id_range offset */ - ri.start = pad(ri.start, pages_rounded) - r->n.r.start; - ri.end -= r->n.r.start; + ri.start = pad(ri.start, pages_rounded) - r->bitmap_start; + ri.end -= r->bitmap_start; id_debug("after adjust %R\n", ri); if (!range_valid(ri) || range_span(ri) < pages) { id_debug("range invalid %d, range_span(ri) %ld, pages %ld\n", @@ -140,7 +153,7 @@ static u64 id_alloc_from_range(id_heap i, id_range r, u64 pages, range subrange) set_next_bit(r, page_order, bit + pages_rounded); i->allocated += pages << page_order(i); - u64 result = (r->n.r.start + bit) << page_order(i); + u64 result = (r->bitmap_start + bit) << page_order(i); id_debug("allocated bit %ld, range page start %ld, returning 0x%lx\n", bit, r->n.r.start, result); return result; @@ -178,7 +191,7 @@ closure_function(2, 1, boolean, dealloc_from_range, range ri = range_intersection(q, n->r); id_range r = (id_range)n; - int bit = ri.start - n->r.start; + int bit = ri.start - r->bitmap_start; u64 pages = range_span(ri); int order = find_order(pages); if (!bitmap_dealloc(r->b, bit, pages)) { @@ -259,7 +272,7 @@ closure_function(3, 1, boolean, set_intersection, { range ri = range_intersection(bound(q), n->r); id_range r = (id_range)n; - int bit = ri.start - n->r.start; + int bit = ri.start - r->bitmap_start; if (!bitmap_range_check_and_set(r->b, bit, range_span(ri), bound(validate), bound(allocate))) return false; return true; @@ -342,7 +355,7 @@ static inline void set_next(id_heap i, bytes count, u64 next) rangemap_foreach(i->ranges, n) { id_range r = (id_range)n; if (point_in_range(r->n.r, next)) - set_next_bit(r, order, next - r->n.r.start); + set_next_bit(r, order, next - r->bitmap_start); else if (r->n.r.start > next) set_next_bit(r, order, 0); } diff --git a/test/unit/id_heap_test.c b/test/unit/id_heap_test.c index a91e7580a..f409777a4 100644 --- a/test/unit/id_heap_test.c +++ b/test/unit/id_heap_test.c @@ -361,6 +361,27 @@ static boolean alloc_subrange_test(heap h) return true; } +static boolean alloc_align_test(heap h) +{ + const u64 unaligned_base = 0x12345; + const u64 heap_size = 0x20000; + id_heap id = create_id_heap(h, h, unaligned_base, heap_size, 1, false); + if (id == INVALID_ADDRESS) { + msg_err("cannot create heap\n"); + return false; + } + for (int size = 1; pad(unaligned_base, size) + size <= unaligned_base + heap_size; size <<= 1) { + u64 i = allocate_u64((heap)id, size); + if (i != pad(unaligned_base, size)) { + msg_err("unexpected allocation 0x%lx for size 0x%x\n", i, size); + return false; + } + deallocate_u64((heap)id, i, size); + } + destroy_heap((heap)id); + return true; +} + int main(int argc, char **argv) { heap h = init_process_runtime(); @@ -383,6 +404,9 @@ int main(int argc, char **argv) if (!alloc_subrange_test(h)) goto fail; + if (!alloc_align_test(h)) + goto fail; + msg_debug("test passed\n"); exit(EXIT_SUCCESS); fail: From ba1662bbac7e9dcf4367c784c59ee50aad419fe4 Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Thu, 14 Dec 2023 16:54:32 +0100 Subject: [PATCH 2/6] allocate_tagged_region(): add 
locking boolean parameter This new parameter determines whether a tagged region allocator should be multithread-safe. Besides allowing some duplicated code to be removed, this change avoids the unnecessary instantiation of locking heap wrappers in the aarch64 code, where the tagged region allocator directly uses the locked heap and is thus always multithread-safe. Initialization of tagged region allocators is moved from platform-specific code to kernel_runtime_init(). In addition, the management subsystem now uses locking heaps instead of non-locking heaps, so that it can be used safely at runtime. --- platform/pc/service.c | 11 ----------- platform/riscv-virt/service.c | 11 ----------- platform/virt/service.c | 11 ----------- src/aarch64/kernel_machine.c | 2 +- src/kernel/init.c | 12 ++++++++++-- src/kernel/kernel.h | 2 +- src/riscv64/kernel_machine.c | 6 ++++-- src/x86_64/kernel_machine.c | 6 ++++-- 8 files changed, 20 insertions(+), 41 deletions(-) diff --git a/platform/pc/service.c b/platform/pc/service.c index 9bcae0a04..e8da3d290 100644 --- a/platform/pc/service.c +++ b/platform/pc/service.c @@ -243,16 +243,6 @@ static void __attribute__((noinline)) init_service_new_stack() { kernel_heaps kh = get_kernel_heaps(); early_init_debug("in init_service_new_stack"); - bytes pagesize = is_low_memory_machine() ? PAGESIZE : PAGESIZE_2M; - init_integers(locking_heap_wrapper(heap_general(kh), - allocate_tagged_region(kh, tag_integer, pagesize))); - init_tuples(locking_heap_wrapper(heap_general(kh), - allocate_tagged_region(kh, tag_table_tuple, pagesize))); - init_symbols(allocate_tagged_region(kh, tag_symbol, pagesize), heap_locked(kh)); - heap vh = allocate_tagged_region(kh, tag_vector, pagesize); - init_vectors(locking_heap_wrapper(heap_general(kh), vh), heap_locked(kh)); - heap sh = allocate_tagged_region(kh, tag_string, pagesize); - init_strings(locking_heap_wrapper(heap_general(kh), sh), heap_locked(kh)); for_regions(e) { switch (e->type) { @@ -265,7 +255,6 @@ static void __attribute__((noinline)) init_service_new_stack() } } - init_management(allocate_tagged_region(kh, tag_function_tuple, pagesize), heap_general(kh)); early_init_debug("init_hwrand"); init_hwrand(); diff --git a/platform/riscv-virt/service.c b/platform/riscv-virt/service.c index e555d846e..e2e8ce95d 100644 --- a/platform/riscv-virt/service.c +++ b/platform/riscv-virt/service.c @@ -102,17 +102,6 @@ static void __attribute__((noinline)) init_service_new_stack(void) { init_debug("in init_service_new_stack\n"); kernel_heaps kh = get_kernel_heaps(); - bytes pagesize = is_low_memory_machine() ?
PAGESIZE : PAGESIZE_2M; - init_integers(locking_heap_wrapper(heap_general(kh), - allocate_tagged_region(kh, tag_integer, pagesize))); - init_tuples(locking_heap_wrapper(heap_general(kh), - allocate_tagged_region(kh, tag_table_tuple, pagesize))); - init_symbols(allocate_tagged_region(kh, tag_symbol, pagesize), heap_locked(kh)); - heap vh = allocate_tagged_region(kh, tag_vector, pagesize); - init_vectors(locking_heap_wrapper(heap_general(kh), vh), heap_locked(kh)); - heap sh = allocate_tagged_region(kh, tag_string, pagesize); - init_strings(locking_heap_wrapper(heap_general(kh), sh), heap_locked(kh)); - init_management(allocate_tagged_region(kh, tag_function_tuple, pagesize), heap_general(kh)); init_debug("calling runtime init\n"); kernel_runtime_init(kh); while(1); diff --git a/platform/virt/service.c b/platform/virt/service.c index aad8ed882..5d0dc4ec9 100644 --- a/platform/virt/service.c +++ b/platform/virt/service.c @@ -244,17 +244,6 @@ static void __attribute__((noinline)) init_service_new_stack(void) { init_debug("in init_service_new_stack\n"); kernel_heaps kh = get_kernel_heaps(); - bytes pagesize = is_low_memory_machine() ? PAGESIZE : PAGESIZE_2M; - init_integers(locking_heap_wrapper(heap_general(kh), - allocate_tagged_region(kh, tag_integer, pagesize))); - init_tuples(locking_heap_wrapper(heap_general(kh), - allocate_tagged_region(kh, tag_table_tuple, pagesize))); - init_symbols(allocate_tagged_region(kh, tag_symbol, pagesize), heap_locked(kh)); - heap vh = allocate_tagged_region(kh, tag_vector, pagesize); - init_vectors(locking_heap_wrapper(heap_general(kh), vh), heap_locked(kh)); - heap sh = allocate_tagged_region(kh, tag_string, pagesize); - init_strings(locking_heap_wrapper(heap_general(kh), sh), heap_locked(kh)); - init_management(allocate_tagged_region(kh, tag_function_tuple, pagesize), heap_general(kh)); init_debug("calling runtime init\n"); kernel_runtime_init(kh); while(1); diff --git a/src/aarch64/kernel_machine.c b/src/aarch64/kernel_machine.c index 54ec46eaa..fa71b4960 100644 --- a/src/aarch64/kernel_machine.c +++ b/src/aarch64/kernel_machine.c @@ -45,7 +45,7 @@ static u64 tag_alloc(heap h, bytes s) return a; } -heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize) +heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize, boolean locking) { heap h = heap_locked(kh); struct tagheap *th = allocate(h, sizeof(struct tagheap)); diff --git a/src/kernel/init.c b/src/kernel/init.c index 2f1480b85..e7b1fbd50 100644 --- a/src/kernel/init.c +++ b/src/kernel/init.c @@ -448,8 +448,17 @@ void kernel_runtime_init(kernel_heaps kh) { heap misc = heap_general(kh); heap locked = heap_locked(kh); + boolean lowmem = is_low_memory_machine(); init_heaps = kh; + bytes pagesize = lowmem ? PAGESIZE : PAGESIZE_2M; + init_integers(allocate_tagged_region(kh, tag_integer, pagesize, true)); + init_tuples(allocate_tagged_region(kh, tag_table_tuple, pagesize, true)); + init_symbols(allocate_tagged_region(kh, tag_symbol, pagesize, false), locked); + init_vectors(allocate_tagged_region(kh, tag_vector, pagesize, true), locked); + init_strings(allocate_tagged_region(kh, tag_string, pagesize, true), locked); + init_management(allocate_tagged_region(kh, tag_function_tuple, pagesize, true), locked); + /* runtime and console init */ #ifdef INIT_DEBUG early_debug("kernel_runtime_init\n"); @@ -460,8 +469,7 @@ void kernel_runtime_init(kernel_heaps kh) dma_init(kh); list_init(&mm_cleaners); spin_lock_init(&mm_lock); - u64 memory_reserve = is_low_memory_machine() ? 
PAGECACHE_LOWMEM_MEMORY_RESERVE : - PAGECACHE_MEMORY_RESERVE; + u64 memory_reserve = lowmem ? PAGECACHE_LOWMEM_MEMORY_RESERVE : PAGECACHE_MEMORY_RESERVE; init_pagecache(locked, reserve_heap_wrapper(misc, (heap)heap_page_backed(kh), memory_reserve), reserve_heap_wrapper(misc, (heap)heap_physical(kh), memory_reserve), PAGESIZE); mem_cleaner pc_cleaner = closure(misc, mm_pagecache_cleaner); diff --git a/src/kernel/kernel.h b/src/kernel/kernel.h index fd00f0c27..1c187154c 100644 --- a/src/kernel/kernel.h +++ b/src/kernel/kernel.h @@ -798,7 +798,7 @@ void unmap_and_free_phys(u64 virtual, u64 length); #if !defined(BOOT) -heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize); +heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize, boolean locking); heap locking_heap_wrapper(heap meta, heap parent); #endif diff --git a/src/riscv64/kernel_machine.c b/src/riscv64/kernel_machine.c index d72034173..744489e08 100644 --- a/src/riscv64/kernel_machine.c +++ b/src/riscv64/kernel_machine.c @@ -9,7 +9,7 @@ #define tag_debug(x, ...) #endif -heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize) +heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize, boolean locking) { heap h = heap_locked(kh); heap p = (heap)heap_physical(kh); @@ -25,7 +25,9 @@ heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize) /* reserve area in virtual_huge */ assert(id_heap_set_area(heap_virtual_huge(kh), tag_base, tag_length, true, true)); - return allocate_mcache(h, backed, 5, find_order(pagesize) - 1, pagesize, false); + heap mc = allocate_mcache(h, backed, 5, find_order(pagesize) - 1, pagesize, false); + assert(mc != INVALID_ADDRESS); + return locking ? locking_heap_wrapper(h, mc) : mc; } extern void *trap_handler; diff --git a/src/x86_64/kernel_machine.c b/src/x86_64/kernel_machine.c index 2a697762a..e9e5f955d 100644 --- a/src/x86_64/kernel_machine.c +++ b/src/x86_64/kernel_machine.c @@ -36,7 +36,7 @@ void interrupt_exit(void) lapic_eoi(); } -heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize) +heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize, boolean locking) { heap h = heap_locked(kh); heap p = (heap)heap_physical(kh); @@ -52,7 +52,9 @@ heap allocate_tagged_region(kernel_heaps kh, u64 tag, bytes pagesize) /* reserve area in virtual_huge */ assert(id_heap_set_area(heap_virtual_huge(kh), tag_base, tag_length, true, true)); - return allocate_mcache(h, backed, 5, find_order(pagesize) - 1, pagesize, false); + heap mc = allocate_mcache(h, backed, 5, find_order(pagesize) - 1, pagesize, false); + assert(mc != INVALID_ADDRESS); + return locking ? locking_heap_wrapper(h, mc) : mc; } void clone_frame_pstate(context_frame dest, context_frame src) From aaddf9f79f61dda0b0a12861ac0e1ce5bc5c6bc6 Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Thu, 14 Dec 2023 17:03:03 +0100 Subject: [PATCH 3/6] Kernel heaps: remove unsafe use of general heap The general (non-locked) kernel heap is not safe to use after the call to kernel_runtime_init(); this change replaces this heap with the locked heap in a few places where the code is executed from the scheduler runloop. 
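As a minimal sketch of the pattern applied by this patch (illustrative only; the function below is hypothetical, while heap_locked(), heap_general() and allocate_vector() are the accessors used in the diffs that follow):

static void example_init(kernel_heaps kh)
{
    /* heap_general() is not safe to use once kernel_runtime_init() has run;
       code reachable from the scheduler runloop must allocate from
       heap_locked() instead */
    heap h = heap_locked(kh);
    vector v = allocate_vector(h, 4);
    assert(v != INVALID_ADDRESS);
}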
--- src/kernel/klib.c | 2 +- src/net/netsyscall.c | 5 +++-- src/unix/coredump.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/kernel/klib.c b/src/kernel/klib.c index 9bb6ace22..d0c1e2983 100644 --- a/src/kernel/klib.c +++ b/src/kernel/klib.c @@ -296,7 +296,7 @@ void init_klib(kernel_heaps kh, void *fs, tuple config_root, status_handler comp heap h = heap_locked(kh); klib_kh = kh; klib_fs = (filesystem)fs; - klib_loaded = allocate_vector(heap_general(kh), 4); + klib_loaded = allocate_vector(h, 4); assert(klib_loaded != INVALID_ADDRESS); extern u8 END; diff --git a/src/net/netsyscall.c b/src/net/netsyscall.c index de6d5a6f0..da434f156 100755 --- a/src/net/netsyscall.c +++ b/src/net/netsyscall.c @@ -2630,12 +2630,13 @@ boolean netsyscall_init(unix_heaps uh, tuple cfg) else so_rcvbuf = DEFAULT_SO_RCVBUF; kernel_heaps kh = (kernel_heaps)uh; - caching_heap socket_cache = allocate_objcache(heap_general(kh), (heap)heap_page_backed(kh), + heap h = heap_locked(kh); + caching_heap socket_cache = allocate_objcache(h, (heap)heap_page_backed(kh), sizeof(struct netsock), PAGESIZE, true); if (socket_cache == INVALID_ADDRESS) return false; uh->socket_cache = socket_cache; - net_loop_poll = closure(heap_general(kh), netsock_poll); + net_loop_poll = closure(h, netsock_poll); netlink_init(); vsock_init(); return true; diff --git a/src/unix/coredump.c b/src/unix/coredump.c index 53cb26e94..34874ed8b 100644 --- a/src/unix/coredump.c +++ b/src/unix/coredump.c @@ -221,7 +221,7 @@ void coredump(thread t, struct siginfo *si, status_handler complete) return; } u64 doff = 0; - heap h = heap_general(get_kernel_heaps()); + heap h = heap_locked(get_kernel_heaps()); process p = t->p; status s = STATUS_OK; From d1b9901d09c0220db30188378c51be3109084577 Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Wed, 13 Dec 2023 16:21:51 +0100 Subject: [PATCH 4/6] Tuples: add is_composite() function This function returns true if the value supplied as an argument is of a composite type, i.e. one that can be operated on using get(), set() and iterate(). Calls to is_tuple() and is_vector() have been refactored to use is_composite() where appropriate; this removes some duplicated code. --- klib/cloud_init.c | 2 +- klib/firewall.c | 2 +- src/drivers/console.c | 2 +- src/fs/tlog.c | 4 ++-- src/kernel/init.c | 2 +- src/kernel/tracelog.c | 2 +- src/runtime/extra_prints.c | 2 +- src/runtime/tuple.c | 16 ++++++++++++++-- src/runtime/tuple.h | 2 ++ src/unix/syscall.c | 4 ++-- 10 files changed, 26 insertions(+), 12 deletions(-) diff --git a/klib/cloud_init.c b/klib/cloud_init.c index 075a08756..31de281d7 100644 --- a/klib/cloud_init.c +++ b/klib/cloud_init.c @@ -82,7 +82,7 @@ static enum cloud cloud_detect(void) static int cloud_init_parse_vector(value config, int (*parse_each)(tuple, vector), vector tasks) { - if (!(is_tuple(config) || is_vector(config))) + if (!is_composite(config)) return KLIB_INIT_FAILED; /* allow parsing either tuple or vector for backward compatibility with older ops/tfs...
*/ value v; diff --git a/klib/firewall.c b/klib/firewall.c index 617f4390c..7b411bede 100644 --- a/klib/firewall.c +++ b/klib/firewall.c @@ -555,7 +555,7 @@ int init(status_handler complete) value rules = get(config, sym(rules)); if (!rules) return KLIB_INIT_OK; - if (!(is_tuple(rules) || is_vector(rules))) { + if (!is_composite(rules)) { rprintf("invalid firewall rules\n"); return KLIB_INIT_FAILED; } diff --git a/src/drivers/console.c b/src/drivers/console.c index 6236d4e76..697377f38 100644 --- a/src/drivers/console.c +++ b/src/drivers/console.c @@ -104,7 +104,7 @@ void config_console(tuple root) value v = get(root, sym(consoles)); if (v == 0) return; - if (!(is_vector(v) || is_tuple(v))) { + if (!is_composite(v)) { msg_err("consoles config is neither vector nor tuple\n"); return; } diff --git a/src/fs/tlog.c b/src/fs/tlog.c index 3a88fd090..5b1ea720b 100644 --- a/src/fs/tlog.c +++ b/src/fs/tlog.c @@ -499,7 +499,7 @@ closure_function(2, 1, void, log_switch_complete, if (is_ok(s)) table_foreach(old_tl->dictionary, k, v) { (void)v; - if ((is_tuple(k) || is_vector(k)) && !table_find(new_tl->dictionary, k)) { + if (is_composite(k) && !table_find(new_tl->dictionary, k)) { tlog_debug(" destroying value %p\n", __func__, k); destruct_value(k, false); } @@ -842,7 +842,7 @@ closure_function(4, 1, void, log_read_complete, table newdict = allocate_table(tl->h, identity_key, pointer_equal); table_foreach(tl->dictionary, k, v) { tlog_debug(" dict swap: k %p, v %p, type %d\n", k, v, tagof(v)); - assert(is_tuple(v) || is_symbol(v) || is_vector(v)); + assert(is_composite(v) || is_symbol(v)); table_set(newdict, v, k); } deallocate_table(tl->dictionary); diff --git a/src/kernel/init.c b/src/kernel/init.c index e7b1fbd50..31a18d2d7 100644 --- a/src/kernel/init.c +++ b/src/kernel/init.c @@ -613,7 +613,7 @@ static boolean vm_exit_match(u8 exit_code, tuple config, symbol option, boolean return (((action_code == exit_code) && !neq) || ((action_code != exit_code) && neq)); } - } else if (is_vector(config_option)) { + } else if (is_composite(config_option)) { for (int i = 0; get_u64(config_option, intern_u64(i), &action_code); i++) { if (action_code == exit_code) return true; diff --git a/src/kernel/tracelog.c b/src/kernel/tracelog.c index 09db7b385..437dfd368 100644 --- a/src/kernel/tracelog.c +++ b/src/kernel/tracelog.c @@ -155,7 +155,7 @@ closure_function(1, 2, boolean, match_attrs, value tv = get(bound(attrs), a); if (!tv) return false; - if (is_tuple(v) || is_vector(v)) { + if (is_composite(v)) { /* We support either a single value for this attribute or a set of acceptable values (as a tuple-encoded vector). 
*/ boolean match = false; diff --git a/src/runtime/extra_prints.c b/src/runtime/extra_prints.c index e11bcaf0b..658786ba4 100644 --- a/src/runtime/extra_prints.c +++ b/src/runtime/extra_prints.c @@ -158,7 +158,7 @@ static boolean is_binary_buffer(buffer b) static void print_value_internal(buffer dest, value v, table *visited, s32 indent, s32 depth) { - if (is_tuple(v) || is_vector(v)) { + if (is_composite(v)) { if (!*visited) { *visited = allocate_table(transient, identity_key, pointer_equal); assert(visited != INVALID_ADDRESS); diff --git a/src/runtime/tuple.c b/src/runtime/tuple.c index 8d0258f13..09c336e5b 100644 --- a/src/runtime/tuple.c +++ b/src/runtime/tuple.c @@ -125,6 +125,18 @@ boolean iterate(value e, binding_handler h) } } +boolean is_composite(value v) +{ + switch (tagof(v)) { + case tag_table_tuple: + case tag_function_tuple: + case tag_vector: + return true; + default: + return false; + } +} + closure_function(1, 2, boolean, tuple_count_each, int *, count, value, s, value, v) @@ -227,7 +239,7 @@ closure_function(2, 2, boolean, destruct_value_each, value, v, boolean, recursive, value, s, value, v) { - if (is_tuple(v) || is_vector(v)) { + if (is_composite(v)) { if (bound(recursive)) destruct_value(v, true); } else if (v != null_value) { @@ -238,7 +250,7 @@ closure_function(2, 2, boolean, destruct_value_each, void destruct_value(value v, boolean recursive) { - if (is_tuple(v) || is_vector(v)) + if (is_composite(v)) iterate(v, stack_closure(destruct_value_each, v, recursive)); deallocate_value(v); } diff --git a/src/runtime/tuple.h b/src/runtime/tuple.h index e08c5d3e7..fb474b843 100644 --- a/src/runtime/tuple.h +++ b/src/runtime/tuple.h @@ -74,6 +74,8 @@ static inline boolean is_integer(value v) return tagof(v) == tag_integer; } +boolean is_composite(value v); + /* we're lax about typing here as these are sometimes used on alloca-wrapped buffers */ static inline boolean u64_from_value(value v, u64 *result) { diff --git a/src/unix/syscall.c b/src/unix/syscall.c index fce89b9bb..a5578dcaa 100755 --- a/src/unix/syscall.c +++ b/src/unix/syscall.c @@ -2871,7 +2871,7 @@ closure_function(1, 1, boolean, notrace_notify, value, v) { notrace_configure(bound(p), false); - if (is_tuple(v) || is_vector(v)) + if (is_composite(v)) iterate(v, stack_closure(notrace_each, bound(p), true)); return true; } @@ -2881,7 +2881,7 @@ closure_function(1, 1, boolean, tracelist_notify, value, v) { notrace_configure(bound(p), true); - if (is_tuple(v) || is_vector(v)) + if (is_composite(v)) iterate(v, stack_closure(notrace_each, bound(p), false)); return true; } From 9509c87d86b00a131af6aac30308558e6ad7fadf Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Thu, 14 Dec 2023 10:39:28 +0100 Subject: [PATCH 5/6] tuple_notifier: add support for copy-on-write and vectors This patch enhances the tuple_notifier implementation so that it can be used with a parent vector (as an alternative to a parent tuple), and adds copy-on-write functionality that makes it possible to operate (i.e. execute get, set or iterate operations) on a composite value (tuple or vector) without modifying the original value or any nested composite values. These new features can be used, for example, to modify the root configuration tuple (e.g. to add or change debug options, environment variables, or program arguments) without persisting these changes in the root filesystem.
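As a usage sketch (the attribute name is hypothetical; tuple_notifier_wrap() with the new copy_on_write flag, and the get()/set() accessors, are taken from the diffs below):

tuple cfg = filesystem_getroot(fs);     /* any existing tuple; fs is assumed */
tuple_notifier tn = tuple_notifier_wrap(cfg, true);
assert(tn != INVALID_ADDRESS);
set((tuple)tn, sym(debug), null_value); /* write lands in a private copy */
value v = get((tuple)tn, sym(debug));   /* returns the override */
/* get(cfg, sym(debug)) still returns whatever cfg originally contained */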
The get_root_tuple() kernel function now returns the tuple_notifier that wraps the root tuple (instead of the root tuple itself), so that configuration settings are initially retrieved from the root filesystem and can be changed at runtime without affecting the contents of the boot disk. --- src/kernel/init.c | 11 ++- src/kernel/stage3.c | 3 +- src/kernel/storage.c | 1 - src/runtime/heap/id.c | 2 +- src/runtime/heap/mcache.c | 2 +- src/runtime/heap/objcache.c | 2 +- src/runtime/heap/reserve.c | 2 +- src/runtime/management.c | 174 ++++++++++++++++++++++++++++++++++-- src/runtime/management.h | 2 +- src/unix/syscall.c | 8 +- src/unix/unix.c | 4 +- src/unix/unix_internal.h | 2 +- 12 files changed, 184 insertions(+), 29 deletions(-) diff --git a/src/kernel/init.c b/src/kernel/init.c index 31a18d2d7..0a9b33478 100644 --- a/src/kernel/init.c +++ b/src/kernel/init.c @@ -189,10 +189,10 @@ closure_function(2, 2, void, fsstarted, root_fs = fs; storage_set_root_fs(fs); - wrapped_root = tuple_notifier_wrap(filesystem_getroot(fs)); + tuple fs_root = filesystem_getroot(fs); + wrapped_root = tuple_notifier_wrap(fs_root, true); assert(wrapped_root != INVALID_ADDRESS); - // XXX use wrapped_root after root fs is separate - tuple root = filesystem_getroot(root_fs); + tuple root = (tuple)wrapped_root; tuple mounts = get_tuple(root, sym(mounts)); if (mounts) storage_set_mountpoints(mounts); @@ -227,7 +227,7 @@ closure_function(2, 2, void, fsstarted, closure_finish(); symbol booted = sym(booted); if (!get(root, booted)) - filesystem_write_eav((tfs)fs, root, booted, null_value, false); + filesystem_write_eav((tfs)fs, fs_root, booted, null_value, false); config_console(root); } @@ -331,12 +331,11 @@ filesystem get_root_fs(void) tuple get_root_tuple(void) { - return root_fs ? filesystem_getroot(root_fs) : 0; + return (tuple)wrapped_root; } void register_root_notify(symbol s, set_value_notify n) { - // XXX to be restored when root fs tuple is separated from root tuple tuple_notifier_register_set_notify(wrapped_root, s, n); } diff --git a/src/kernel/stage3.c b/src/kernel/stage3.c index 3d7fc1644..7bfb966ba 100644 --- a/src/kernel/stage3.c +++ b/src/kernel/stage3.c @@ -90,7 +90,6 @@ static void init_kernel_heaps_management(tuple root) set(heaps, sym(physical), heap_management((heap)heap_physical(kh))); set(heaps, sym(general), heap_management((heap)heap_general(kh))); set(heaps, sym(locked), heap_management((heap)heap_locked(kh))); - set(heaps, sym(no_encode), null_value); set(root, sym(heaps), heaps); } @@ -135,7 +134,7 @@ closure_function(6, 0, void, startup, filesystem_set_readonly(fs); value p = get(root, sym(program)); assert(p && is_string(p)); - tuple pro = resolve_path(root, split(general, p, '/')); + tuple pro = resolve_path(filesystem_getroot(fs), split(general, p, '/')); if (!pro) halt("unable to resolve program path \"%b\"\n", p); program_set_perms(root, pro); diff --git a/src/kernel/storage.c b/src/kernel/storage.c index 5357fd475..59d964590 100644 --- a/src/kernel/storage.c +++ b/src/kernel/storage.c @@ -40,7 +40,6 @@ static struct { static void notify_mount_change_locked(void); /* Called with mutex locked. */ -// XXX this won't work with wrapped root... 
static volume storage_get_volume(tuple root) { list_foreach(&storage.volumes, e) { diff --git a/src/runtime/heap/id.c b/src/runtime/heap/id.c index 6bc9530a8..5c2d9a823 100644 --- a/src/runtime/heap/id.c +++ b/src/runtime/heap/id.c @@ -453,7 +453,7 @@ static value id_management(heap h) symbol s; tuple t = timm("type", "id", "pagesize", "%d", i->h.pagesize); assert(t != INVALID_ADDRESS); - tuple_notifier n = tuple_notifier_wrap(t); + tuple_notifier n = tuple_notifier_wrap(t, false); assert(n != INVALID_ADDRESS); register_stat(i, n, t, allocated); register_stat(i, n, t, total); diff --git a/src/runtime/heap/mcache.c b/src/runtime/heap/mcache.c index 66844f9f9..82ad98b23 100644 --- a/src/runtime/heap/mcache.c +++ b/src/runtime/heap/mcache.c @@ -282,7 +282,7 @@ static value mcache_management(heap h) symbol s; tuple t = timm("type", "mcache", "pagesize", "%d", m->h.pagesize); assert(t != INVALID_ADDRESS); - tuple_notifier n = tuple_notifier_wrap(t); + tuple_notifier n = tuple_notifier_wrap(t, false); assert(n != INVALID_ADDRESS); register_stat(m, n, t, allocated); register_stat(m, n, t, total); diff --git a/src/runtime/heap/objcache.c b/src/runtime/heap/objcache.c index a0e500c49..dbd66b28e 100644 --- a/src/runtime/heap/objcache.c +++ b/src/runtime/heap/objcache.c @@ -439,7 +439,7 @@ static value objcache_management(heap h) symbol s; tuple t = timm("type", "objcache", "pagesize", "%d", object_size(o)); assert(t != INVALID_ADDRESS); - tuple_notifier n = tuple_notifier_wrap(t); + tuple_notifier n = tuple_notifier_wrap(t, false); assert(n != INVALID_ADDRESS); register_stat(o, n, t, allocated); register_stat(o, n, t, total); diff --git a/src/runtime/heap/reserve.c b/src/runtime/heap/reserve.c index e6f2d0c56..4a3a8637e 100644 --- a/src/runtime/heap/reserve.c +++ b/src/runtime/heap/reserve.c @@ -83,7 +83,7 @@ static value reservelock_management(heap h) symbol s; tuple t = timm("type", "reservelock", "pagesize", "%d", rl->h.pagesize); assert(t != INVALID_ADDRESS); - tuple_notifier n = tuple_notifier_wrap(t); + tuple_notifier n = tuple_notifier_wrap(t, false); assert(n != INVALID_ADDRESS); register_stat(rl, n, t, allocated); register_stat(rl, n, t, total); diff --git a/src/runtime/management.c b/src/runtime/management.c index 43e17d377..21b9de1f8 100644 --- a/src/runtime/management.c +++ b/src/runtime/management.c @@ -215,11 +215,29 @@ tuple allocate_function_tuple(tuple_get g, tuple_set s, tuple_iterate i) typedef struct tuple_notifier { struct function_tuple f; - tuple parent; + value parent; table get_notifys; /* get_value_notifys */ table set_notifys; /* set_value_notifys */ } *tuple_notifier; +typedef struct tuple_notifier_cow { + struct tuple_notifier tn; + value parent_copy; + struct spinlock lock; +} *tuple_notifier_cow; + +#ifdef KERNEL + +#define tn_cow_lock(tn) spin_lock(&(tn)->lock) +#define tn_cow_unlock(tn) spin_unlock(&(tn)->lock) + +#else + +#define tn_cow_lock(tn) +#define tn_cow_unlock(tn) + +#endif + /* It might be a shade more elegant to make a function-backed value to poll dynamic values rather than use a get notifier, and it would avoid the table lookup in get_notifys. 
There are some subtleties that need to be sorted @@ -247,6 +265,53 @@ closure_function(1, 1, value, tuple_notifier_get, return get(bound(tn)->parent, a); /* transparent */ } +static boolean tn_init_copy(tuple_notifier_cow tn) +{ + if (tn->parent_copy == INVALID_ADDRESS) { + if (is_tuple(tn->tn.parent)) + tn->parent_copy = allocate_tuple(); + else + tn->parent_copy = allocate_tagged_vector(vector_length(tn->tn.parent)); + if (tn->parent_copy == INVALID_ADDRESS) + return false; + } + return true; +} + +closure_function(1, 1, value, tuple_notifier_cow_get, + tuple_notifier_cow, tn, + value, a) +{ + symbol s; + tuple_notifier_cow tn_cow = bound(tn); + tuple_notifier tn = &tn_cow->tn; + get_value_notify n; + if (tn->get_notifys && ((s = sym_from_attribute(a)) && (n = table_find(tn->get_notifys, s)))) + return apply(n); + tn_cow_lock(tn_cow); + value v; + if (tn_cow->parent_copy != INVALID_ADDRESS) { + v = get(tn_cow->parent_copy, a); + if (v) + goto out; + } + v = get(tn->parent, a); + if (is_composite(v)) { + if (!tn_init_copy(tn_cow)) { + v = 0; + goto out; + } + v = tuple_notifier_wrap(v, true); + if (v != INVALID_ADDRESS) + set(tn_cow->parent_copy, a, v); + else + v = 0; + } + out: + tn_cow_unlock(tn_cow); + return v; +} + closure_function(1, 2, void, tuple_notifier_set, tuple_notifier, tn, value, a, value, v) @@ -262,6 +327,24 @@ closure_function(1, 2, void, tuple_notifier_set, set(bound(tn)->parent, a, v); } +closure_function(1, 2, void, tuple_notifier_cow_set, + tuple_notifier_cow, tn, + value, a, value, v) +{ + tuple_notifier_cow tn_cow = bound(tn); + tuple_notifier tn = &tn_cow->tn; + symbol s; + set_value_notify n; + if (tn->set_notifys && ((s = sym_from_attribute(a))) && (n = table_find(tn->set_notifys, s))) { + if (!apply(n, v)) + return; /* setting of value not allowed */ + } + tn_cow_lock(tn_cow); + if (tn_init_copy(tn_cow)) + set(tn_cow->parent_copy, a, v); + tn_cow_unlock(tn_cow); +} + closure_function(2, 2, boolean, tuple_notifier_iterate_each, tuple_notifier, tn, binding_handler, h, value, a, value, v) @@ -279,12 +362,71 @@ closure_function(1, 1, boolean, tuple_notifier_iterate, binding_handler, h) { /* This assumes that all attributes of interest exist in the parent - tuple. Values that are served by get_notifys should still have - corresponding entries in the parent tuple if they are to be included in + value. Values that are served by get_notifys should still have + corresponding entries in the parent value if they are to be included in an iterate. 
*/ return iterate(bound(tn)->parent, stack_closure(tuple_notifier_iterate_each, bound(tn), h)); } +closure_function(2, 2, boolean, tuple_notifier_cow_iterate_each, + tuple_notifier_cow, tn, binding_handler, h, + value, a, value, v) +{ + tuple_notifier_cow tn_cow = bound(tn); + tuple_notifier tn = &tn_cow->tn; + symbol s = sym_from_attribute(a); + get_value_notify n; + if (tn->get_notifys && (n = table_find(tn->get_notifys, s))) { + v = apply(n); + } else { + tn_cow_lock(tn_cow); + value v_copy; + if (tn_cow->parent_copy != INVALID_ADDRESS) { + value v_copy = get(tn_cow->parent_copy, a); + if (v_copy) { + v = v_copy; + } + } else { + v_copy = 0; + } + if (!v_copy && is_composite(v)) { + if (!tn_init_copy(tn_cow)) + goto error; + v = tuple_notifier_wrap(v, true); + if (v == INVALID_ADDRESS) + goto error; + set(tn_cow->parent_copy, a, v); + } + tn_cow_unlock(tn_cow); + } + return apply(bound(h), s, v); + error: + tn_cow_unlock(tn_cow); + return false; +} + +closure_function(2, 2, boolean, tuple_notifier_iterate_copy_each, + value, parent, binding_handler, h, + value, a, value, v) +{ + if (get(bound(parent), a)) /* value has been handled in parent iterator */ + return true; + return apply(bound(h), sym_from_attribute(a), v); +} + +closure_function(1, 1, boolean, tuple_notifier_cow_iterate, + tuple_notifier_cow, tn, + binding_handler, h) +{ + tuple_notifier_cow tn = bound(tn); + value parent = tn->tn.parent; + if (!iterate(parent, stack_closure(tuple_notifier_cow_iterate_each, tn, h))) + return false; + if (tn->parent_copy == INVALID_ADDRESS) + return true; + return iterate(tn->parent_copy, stack_closure(tuple_notifier_iterate_copy_each, parent, h)); +} + closure_function(1, 0, value, tuple_notifier_wrapped, tuple_notifier, tn) { @@ -307,22 +449,36 @@ void tuple_notifier_register_set_notify(tuple_notifier tn, symbol s, set_value_n assert(tn->set_notifys != INVALID_ADDRESS); } table_set(tn->set_notifys, s, n); - value v = get(tn->parent, s); + value v = get(tn, s); if (v) apply(n, v); } -tuple_notifier tuple_notifier_wrap(tuple parent) +tuple_notifier tuple_notifier_wrap(value parent, boolean copy_on_write) { - tuple_notifier tn = allocate(management.fth, sizeof(struct tuple_notifier)); + tuple_notifier tn; + tuple_notifier_cow tn_cow; + if (copy_on_write) { + tn_cow = allocate(management.fth, sizeof(struct tuple_notifier_cow)); + tn = &tn_cow->tn; + } else { + tn = allocate(management.fth, sizeof(struct tuple_notifier)); + } if (tn == INVALID_ADDRESS) return tn; tn->parent = parent; tn->get_notifys = 0; tn->set_notifys = 0; - tn->f.g = closure(management.h, tuple_notifier_get, tn); - tn->f.s = closure(management.h, tuple_notifier_set, tn); - tn->f.i = closure(management.h, tuple_notifier_iterate, tn); + tn->f.g = copy_on_write ? closure(management.h, tuple_notifier_cow_get, tn_cow) : + closure(management.h, tuple_notifier_get, tn); + tn->f.s = copy_on_write ? closure(management.h, tuple_notifier_cow_set, tn_cow) : + closure(management.h, tuple_notifier_set, tn); + tn->f.i = copy_on_write ? closure(management.h, tuple_notifier_cow_iterate, tn_cow) : + closure(management.h, tuple_notifier_iterate, tn); + if (copy_on_write) { + tn_cow->parent_copy = INVALID_ADDRESS; + spin_lock_init(&tn_cow->lock); + } /* The special /wrapped attribute is probed by print_value and friends. Since it's not in the parent tuple, it won't show up in an iterate; it's hidden. 
*/ diff --git a/src/runtime/management.h b/src/runtime/management.h index a4d552871..49daacace 100644 --- a/src/runtime/management.h +++ b/src/runtime/management.h @@ -9,7 +9,7 @@ typedef struct tuple_notifier *tuple_notifier; typedef closure_type(set_value_notify, boolean, value); typedef closure_type(get_value_notify, value); -tuple_notifier tuple_notifier_wrap(tuple parent); +tuple_notifier tuple_notifier_wrap(value parent, boolean copy_on_write); void tuple_notifier_unwrap(tuple_notifier tn); void tuple_notifier_register_get_notify(tuple_notifier tn, symbol s, get_value_notify n); void tuple_notifier_register_set_notify(tuple_notifier tn, symbol s, set_value_notify n); diff --git a/src/unix/syscall.c b/src/unix/syscall.c index a5578dcaa..c35993f7a 100755 --- a/src/unix/syscall.c +++ b/src/unix/syscall.c @@ -2753,18 +2753,20 @@ closure_function(0, 2, void, print_missing_files_cfn, rprintf("missing_files_end\n"); } -void init_syscalls(tuple root) +void init_syscalls(process p) { heap h = heap_locked(get_kernel_heaps()); syscall = syscall_handler; syscall_io_complete = closure(h, syscall_io_complete_cfn); io_completion_ignore = closure(h, io_complete_ignore); + filesystem fs = p->root_fs; vector hostname_v = split(h, alloca_wrap_cstring("etc/hostname"), '/'); - tuple hostname_t = resolve_path(root, hostname_v); + tuple hostname_t = resolve_path(filesystem_getroot(fs), hostname_v); split_dealloc(hostname_v); if (hostname_t) - filesystem_read_entire(get_root_fs(), hostname_t, h, + filesystem_read_entire(fs, hostname_t, h, closure(h, hostname_done), ignore_status); + tuple root = p->process_root; do_syscall_stats = get(root, sym(syscall_summary)) != 0; if (do_syscall_stats) { print_syscall_stats = closure(h, print_syscall_stats_cfn); diff --git a/src/unix/unix.c b/src/unix/unix.c index 99686b62b..d7d674b9b 100644 --- a/src/unix/unix.c +++ b/src/unix/unix.c @@ -533,7 +533,7 @@ process create_process(unix_heaps uh, tuple root, filesystem fs) } filesystem_reserve(fs); /* because it hosts the current working directory */ p->root_fs = p->cwd_fs = fs; - p->cwd = fs->get_inode(fs, root); + p->cwd = fs->get_inode(fs, filesystem_getroot(fs)); p->process_root = root; p->fdallocator = create_id_heap(locked, locked, 0, infinity, 1, false); p->files = allocate_vector(locked, 64); @@ -658,7 +658,7 @@ process init_unix(kernel_heaps kh, tuple root, filesystem fs) ftrace_enable(); register_special_files(kernel_process); - init_syscalls(kernel_process->process_root); + init_syscalls(kernel_process); register_file_syscalls(linux_syscalls); #ifdef NET register_net_syscalls(linux_syscalls); diff --git a/src/unix/unix_internal.h b/src/unix/unix_internal.h index 4296a7aa2..b2301e087 100644 --- a/src/unix/unix_internal.h +++ b/src/unix/unix_internal.h @@ -1021,7 +1021,7 @@ static inline u64 iov_total_len(struct iovec *iov, int iovcnt) #define resolve_fd(__p, __fd) ({void *f ; if (!(f = fdesc_get(__p, __fd))) return set_syscall_error(current, EBADF); f;}) -void init_syscalls(tuple root); +void init_syscalls(process p); void init_threads(process p); void init_futices(process p); From efd391bed5dec665e1fa980961350059342ab4b0 Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Wed, 13 Dec 2023 11:56:21 +0100 Subject: [PATCH 6/6] Command line: add support for arbitrary configuration settings This patch adds support for changing arbitrary settings in the root tuple via the kernel command line (which is retrieved by the kernel when booting under AWS Firecracker on x86). 
The PC platform code has been amended by adding a new REGION_CMDLINE region type holding the location and size of the command line, which can be accessed later in the boot process. The existing parsing of the virtio_mmio command line options has been moved to kernel/init.c, so that it can be called by virtio_mmio_enum_devs() instead of platform-specific code. The only platform-specific code is the code that retrieves (and potentially updates) the command line, while the parsing implementation is in kernel/init.c. With these changes, it is possible for example to override the network settings when starting an instance from a given image (without modifying the image itself) by specifying those settings in the kernel command line ("boot_args" parameter in the Firecracker configuration file), as in the following example: "en1.ipaddr=10.3.3.6 en1.netmask=255.255.0.0 en1.gateway=10.3.0.1". In the above example, "en1" identifies the first network interface; if multiple interfaces are used (en2, en3, etc.), each of them can be configured independently. Example to configure a static IPv6 address on the first network interface: "en1.ip6addr=20::A8FC:FF:7600:AA" Example to add an environment variable (or override its value if the variable is already present in the image): "environment.VAR_NAME=VAR_VALUE" Example to modify the program arguments: "arguments.0=/bin/my-program arguments.1=--my-option" Closes #1976 --- platform/pc/service.c | 43 +++++++-------- src/aarch64/kernel_machine.h | 3 + src/kernel/init.c | 104 ++++++++++++++++++++++++++++++++++- src/kernel/kernel.h | 5 ++ src/kernel/region.h | 1 + src/riscv64/kernel_machine.h | 3 + src/runtime/memops.c | 8 +++ src/runtime/runtime.h | 1 + src/virtio/virtio.h | 1 - src/virtio/virtio_mmio.c | 7 ++- src/x86_64/kernel_machine.h | 3 + 11 files changed, 153 insertions(+), 26 deletions(-) diff --git a/platform/pc/service.c b/platform/pc/service.c index e8da3d290..58cfa5c52 100644 --- a/platform/pc/service.c +++ b/platform/pc/service.c @@ -389,26 +389,6 @@ static inline void jump_to_virtual(void) { 1: \n"); } -static void cmdline_parse(const char *cmdline) -{ - early_init_debug("parsing cmdline"); - const char *opt_end, *prefix_end; - while (*cmdline) { - opt_end = runtime_strchr(cmdline, ' '); - if (!opt_end) - opt_end = cmdline + runtime_strlen(cmdline); - prefix_end = runtime_strchr(cmdline, '.'); - if (prefix_end && (prefix_end < opt_end)) { - int prefix_len = prefix_end - cmdline; - if ((prefix_len == sizeof("virtio_mmio") - 1) && - !runtime_memcmp(cmdline, "virtio_mmio", prefix_len)) - virtio_mmio_parse(get_kernel_heaps(), prefix_end + 1, - opt_end - (prefix_end + 1)); - } - cmdline = opt_end + 1; - } -} - extern void *READONLY_END; /* Returns the number of pages have been allocated from the temporary page table region. 
*/ @@ -479,7 +459,6 @@ void init_service(u64 rdi, u64 rsi) */ assert(u64_from_pointer(params) + cmdline_size < MBR_ADDRESS); runtime_memcpy(params, cmdline, cmdline_size); - params[cmdline_size] = '\0'; cmdline = (char *)params; } @@ -496,7 +475,7 @@ void init_service(u64 rdi, u64 rsi) init_page_initial_map(pointer_from_u64(PAGES_BASE), initial_pages); init_kernel_heaps(); if (cmdline) - cmdline_parse(cmdline); + create_region(u64_from_pointer(cmdline), cmdline_size, REGION_CMDLINE); u64 stack_size = 32*PAGESIZE; u64 stack_location = allocate_u64((heap)heap_page_backed(get_kernel_heaps()), stack_size); stack_location += stack_size - STACK_ALIGNMENT; @@ -607,3 +586,23 @@ void detect_devices(kernel_heaps kh, storage_attach sa) init_virtio_balloon(kh); init_virtio_rng(kh); } + +void cmdline_consume(const char *opt_name, cmdline_handler h) +{ + for_regions(e) { + if (e->type == REGION_CMDLINE) { + e->length = cmdline_parse(pointer_from_u64(e->base), e->length, opt_name, h); + return; + } + } +} + +void boot_params_apply(tuple t) +{ + for_regions(e) { + if (e->type == REGION_CMDLINE) { + cmdline_apply(pointer_from_u64(e->base), e->length, t); + return; + } + } +} diff --git a/src/aarch64/kernel_machine.h b/src/aarch64/kernel_machine.h index 23425decf..8f7da0361 100644 --- a/src/aarch64/kernel_machine.h +++ b/src/aarch64/kernel_machine.h @@ -205,6 +205,9 @@ static inline void wait_for_interrupt(void) enable_interrupts(); } +#define cmdline_consume(o, h) (void)(h) +#define boot_params_apply(t) + /* locking constructs */ #include diff --git a/src/kernel/init.c b/src/kernel/init.c index 0a9b33478..1422387c2 100644 --- a/src/kernel/init.c +++ b/src/kernel/init.c @@ -193,6 +193,8 @@ closure_function(2, 2, void, fsstarted, wrapped_root = tuple_notifier_wrap(fs_root, true); assert(wrapped_root != INVALID_ADDRESS); tuple root = (tuple)wrapped_root; + boot_params_apply(root); + reclaim_regions(); /* for pc: no accessing regions after this point */ tuple mounts = get_tuple(root, sym(mounts)); if (mounts) storage_set_mountpoints(mounts); @@ -231,6 +233,107 @@ closure_function(2, 2, void, fsstarted, config_console(root); } +static char *cmdline_next_option(char *cmdline_start, int cmdline_len, int *opt_len) +{ + while (cmdline_len > 0) { + if (*cmdline_start == ' ') { + cmdline_start++; + cmdline_len--; + } else { + break; + } + } + char *opt_end = runtime_memchr(cmdline_start, ' ', cmdline_len); + if (!opt_end) + opt_end = cmdline_start + cmdline_len; + if (opt_end > cmdline_start) { + *opt_len = opt_end - cmdline_start; + return cmdline_start; + } + return 0; +} + +static int cmdline_get_prefix(char *opt_start, int opt_len) +{ + char *prefix_end = runtime_memchr(opt_start, '.', opt_len); + return (prefix_end ? (prefix_end - opt_start) : 0); +} + +/* Removes consumed options from command line; returns updated command line length. 
*/ +int cmdline_parse(char *cmdline_start, int cmdline_len, const char *opt_name, cmdline_handler h) +{ + init_debug("%s (%d): option %s", __func__, cmdline_len, opt_name); + char *cmdline_end = cmdline_start + cmdline_len; + char *opt_start; + int opt_len; + int name_len = runtime_strlen(opt_name); + char *p = cmdline_start; + while ((opt_start = cmdline_next_option(p, cmdline_end - p, &opt_len))) { + char *opt_end = opt_start + opt_len; + int prefix_len = cmdline_get_prefix(opt_start, opt_len); + if ((prefix_len == name_len) && !runtime_memcmp(opt_start, opt_name, prefix_len)) { + char *value_start = opt_start + prefix_len + 1; + apply(h, value_start, opt_end - value_start); + + /* consume parsed option */ + runtime_memcpy(opt_start, opt_end, cmdline_end - opt_end); + cmdline_len -= opt_len; + cmdline_end -= opt_len; + p = opt_start; + } else { + p = opt_end; + } + } + init_debug("%s: updated length %d", __func__, cmdline_len); + return cmdline_len; +} + +static void cmdline_apply_option(char *opt_start, int opt_len, value cfg) +{ + char *name_end = runtime_memchr(opt_start, '=', opt_len); + if (!name_end) + return; + char *opt_end = opt_start + opt_len; + int prefix_len = cmdline_get_prefix(opt_start, opt_len); + if (prefix_len && (prefix_len < name_end - opt_start)) { + symbol s = intern(alloca_wrap_buffer(opt_start, prefix_len)); + value child = get(cfg, s); + if (!child) { + child = allocate_tuple(); + assert(child != INVALID_ADDRESS); + set(cfg, s, child); + } + if (is_composite(child)) { + char *value_start = opt_start + prefix_len + 1; + cmdline_apply_option(value_start, opt_end - value_start, child); + } + } else { + char *value_start = name_end + 1; + symbol name = intern(alloca_wrap_buffer(opt_start, name_end - opt_start)); + string val; + if (value_start < opt_end) { + val = allocate_string(opt_end - value_start); + assert(val != INVALID_ADDRESS); + buffer_append(val, value_start, opt_end - value_start); + } else { + val = 0; + } + set(cfg, name, val); + } +} + +void cmdline_apply(char *cmdline_start, int cmdline_len, tuple t) +{ + char *opt_start; + int opt_len; + while ((opt_start = cmdline_next_option(cmdline_start, cmdline_len, &opt_len))) { + cmdline_apply_option(opt_start, opt_len, t); + char *opt_end = opt_start + opt_len; + cmdline_len -= opt_end - cmdline_start; + cmdline_start = opt_end; + } +} + #ifdef MM_DEBUG #define mm_debug(x, ...) 
do {tprintf(sym(mm), 0, x, ##__VA_ARGS__);} while(0) #else @@ -481,7 +584,6 @@ void kernel_runtime_init(kernel_heaps kh) init_platform_devices(kh); init_symtab(kh); read_kernel_syms(); - reclaim_regions(); /* for pc: no accessing regions after this point */ shutdown_completions = allocate_vector(locked, SHUTDOWN_COMPLETIONS_SIZE); init_debug("init_kernel_contexts"); diff --git a/src/kernel/kernel.h b/src/kernel/kernel.h index 1c187154c..9a915d559 100644 --- a/src/kernel/kernel.h +++ b/src/kernel/kernel.h @@ -5,6 +5,8 @@ #include #endif +typedef closure_type(cmdline_handler, void, const char *, int); + #ifdef KERNEL void runloop_target(void) __attribute__((noreturn)); #endif @@ -751,6 +753,9 @@ void kernel_runtime_init(kernel_heaps kh); void read_kernel_syms(void); void reclaim_regions(void); +int cmdline_parse(char *cmdline_start, int cmdline_len, const char *opt_name, cmdline_handler h); +void cmdline_apply(char *cmdline_start, int cmdline_len, tuple t); + boolean breakpoint_insert(heap h, u64 a, u8 type, u8 length, thunk completion); boolean breakpoint_remove(heap h, u32 a, thunk completion); void destruct_context(context c); diff --git a/src/kernel/region.h b/src/kernel/region.h index b7a7b90c3..e8c9bfc9c 100644 --- a/src/kernel/region.h +++ b/src/kernel/region.h @@ -13,6 +13,7 @@ typedef struct region *region; #define REGION_PHYSICAL 1 /* available physical memory */ #define REGION_DEVICE 2 /* e820 physical region configured for i/o */ #define REGION_INITIAL_PAGES 10 /* for page table allocations in stage2 and early stage3 */ +#define REGION_CMDLINE 11 /* kernel command line */ #define REGION_FILESYSTEM 12 /* offset on disk for the filesystem, see if we can get disk info from the bios */ #define REGION_KERNIMAGE 13 /* location of kernel elf image loaded by stage2 */ #define REGION_RECLAIM 14 /* areas to be unmapped and reclaimed in stage3 (only stage2 stack presently) */ diff --git a/src/riscv64/kernel_machine.h b/src/riscv64/kernel_machine.h index 4b2f6b123..2209e1a07 100644 --- a/src/riscv64/kernel_machine.h +++ b/src/riscv64/kernel_machine.h @@ -135,6 +135,9 @@ static inline void wait_for_interrupt(void) disable_interrupts(); } +#define cmdline_consume(o, h) (void)(h) +#define boot_params_apply(t) + /* locking constructs */ #include diff --git a/src/runtime/memops.c b/src/runtime/memops.c index 1b8885e45..e22d4f3ab 100644 --- a/src/runtime/memops.c +++ b/src/runtime/memops.c @@ -204,3 +204,11 @@ int runtime_memcmp(const void *a, const void *b, bytes len) } return memcmp_8(a + len - end_len, p_long_b, end_len); } + +void *runtime_memchr(const void *a, int c, bytes len) +{ + for (const char *p = a; len > 0; p++, len--) + if (*p == c) + return (void *)p; + return 0; +} diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h index e282fdd17..e8ba63ee2 100644 --- a/src/runtime/runtime.h +++ b/src/runtime/runtime.h @@ -87,6 +87,7 @@ void runtime_memcpy(void *a, const void *b, bytes len); void runtime_memset(u8 *a, u8 b, bytes len); int runtime_memcmp(const void *a, const void *b, bytes len); +void *runtime_memchr(const void *a, int c, bytes len); static inline int runtime_strlen(const char *a) { diff --git a/src/virtio/virtio.h b/src/virtio/virtio.h index 7039e7d61..f1031b262 100644 --- a/src/virtio/virtio.h +++ b/src/virtio/virtio.h @@ -6,5 +6,4 @@ void init_virtio_rng(kernel_heaps kh); void init_virtio_scsi(kernel_heaps kh, storage_attach a); void init_virtio_socket(kernel_heaps kh); -void virtio_mmio_parse(kernel_heaps kh, const char *str, int len); void 
virtio_mmio_enum_devs(kernel_heaps kh); diff --git a/src/virtio/virtio_mmio.c b/src/virtio/virtio_mmio.c index f53925193..87c0a5837 100644 --- a/src/virtio/virtio_mmio.c +++ b/src/virtio/virtio_mmio.c @@ -51,7 +51,9 @@ closure_function(1, 1, void, vtmmio_new_dev, list_push_back(&vtmmio_devices, &dev->l); } -void virtio_mmio_parse(kernel_heaps kh, const char *str, int len) +closure_function(1, 2, void, vtmmio_cmdline_parse, + kernel_heaps, kh, + const char *, str, int, len) { buffer b = alloca_wrap_buffer(str, len); int optname_len = buffer_strchr(b, '='); @@ -86,12 +88,13 @@ void virtio_mmio_parse(kernel_heaps kh, const char *str, int len) .memsize = memsize, .irq = irq, }; - apply(stack_closure(vtmmio_new_dev, kh), &adev); + apply(stack_closure(vtmmio_new_dev, bound(kh)), &adev); } } void virtio_mmio_enum_devs(kernel_heaps kh) { + cmdline_consume("virtio_mmio", stack_closure(vtmmio_cmdline_parse, kh)); acpi_get_vtmmio_devs(stack_closure(vtmmio_new_dev, kh)); } diff --git a/src/x86_64/kernel_machine.h b/src/x86_64/kernel_machine.h index 88207b7f2..b0d3b6325 100644 --- a/src/x86_64/kernel_machine.h +++ b/src/x86_64/kernel_machine.h @@ -170,6 +170,9 @@ void install_gdt64_and_tss(void *tss_desc, void *tss, void *gdt, void *gdt_point #ifdef KERNEL /* locking constructs */ #include + +void cmdline_consume(const char *opt_name, cmdline_handler h); +void boot_params_apply(tuple t); #endif /* device mmio region access */
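
Usage note: a sketch of how a platform driver might consume its own dotted options through the new hook (the "mydrv" prefix and handler are hypothetical; cmdline_consume() and the cmdline_handler closure type are introduced by this patch, following the vtmmio_cmdline_parse example above):

closure_function(0, 2, void, mydrv_cmdline_parse,
                 const char *, str, int, len)
{
    /* str/len cover the text after the prefix: for the boot option
       "mydrv.debug=1", str points at "debug=1" and len is 7 */
}

static void mydrv_probe(void)
{
    cmdline_consume("mydrv", stack_closure(mydrv_cmdline_parse));
}

Matching options are removed from the stored command line, so a later boot_params_apply() will not copy them into the root tuple.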