diff --git a/docs/book.toml b/docs/book.toml
new file mode 100644
index 0000000..95e32d1
--- /dev/null
+++ b/docs/book.toml
@@ -0,0 +1,16 @@
+[book]
+authors = ["The Managarm Project"]
+language = "en"
+multilingual = false
+src = "src"
+title = "libarch documentation"
+
+[output.html]
+site-url = "/libarch/"
+git-repository-url = "https://github.com/managarm/libarch"
+mathjax-support = true
+
+[output.html.search]
+enable = true
+limit-results = 30
+use-boolean-and = true
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
new file mode 100644
index 0000000..d6f8570
--- /dev/null
+++ b/docs/src/SUMMARY.md
@@ -0,0 +1,4 @@
+# Summary
+
+[libarch](index.md)
+- [Memory ordering](memory-order.md)
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..f7e70e9
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,14 @@
+# libarch
+
+libarch is a library to abstract over architecture specifics when accessing hardware (e.g., as part of device drivers).
+Among other things, it provides:
+- helpers to deal with bitfields,
+- interfaces for DMA buffer allocation,
+- DMA and MMIO barriers,
+- endianness conversion.
+
+Documentation permalink:
+
+## Projects using libarch
+
+- [Managarm](https://managarm.org) - Pragmatic microkernel-based OS with fully asynchronous I/O.
diff --git a/docs/src/memory-order.md b/docs/src/memory-order.md
new file mode 100644
index 0000000..90b57c4
--- /dev/null
+++ b/docs/src/memory-order.md
@@ -0,0 +1,170 @@
+# Memory ordering
+
+## `arch::io_mem_space`
+
+`arch::io_mem_space` is intended to be used with I/O memory mappings (i.e., MMIO).
+**If `arch::io_mem_space` is used with other memory (e.g., main memory),
+the ordering guarantees that libarch provides may be weaker than stated below.**
+
+### `load()` and `store()`
+
+The `load()` and `store()` methods are intended to be suitable for most MMIO accesses that need to
+be done in device drivers. libarch guarantees the following constraints for `load()` and `store()`:
+1. All main memory accesses that occur _after_ `load()` in program order
+   are ordered _after_ the `load()` access
+   for all observers (i.e., all CPUs and devices)
+   that have coherent visibility of both memory accesses.
+2. All main memory accesses that occur _before_ `store()` in program order
+   are ordered _before_ the `store()` access
+   for all observers (i.e., all CPUs and devices)
+   that have coherent visibility of both memory accesses.
+3. `load()` and `store()` respect the I/O memory ordering of the underlying
+   memory mapping. In particular, if the underlying mapping does not permit
+   reordering, then `load()` and `store()` calls to the same device memory region are strongly ordered,
+   i.e., all observers observe all `load()` and `store()` accesses to the same device
+   memory region in the same order.
+   What constitutes a memory region is architecture-, bus-, and device-specific.
+4. `load()` interacts as expected (i.e., as if it were a main memory read)
+   with preceding load-acquire reads from main memory.
+   Likewise, `store()` interacts as expected (i.e., as if it were a main memory write)
+   with following store-release writes to main memory.
+
+Here, "coherent visibility" means that either the device is cache coherent with the
+CPU that performs the `load()` or `store()`, or that appropriate cache clean and/or invalidate
+operations are performed.
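+
+For example, constraint 1 makes the usual device-to-host completion pattern safe
+without explicit barriers. The following is a minimal sketch; the status register
+at offset `0x00` and its completion bit are hypothetical, not part of libarch:
+
+```cpp
+#include <cstdint>
+
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+void read_completion(arch::io_mem_space mmio, volatile uint32_t *dma_buffer) {
+	arch::scalar_register<uint32_t> status(0x00); // hypothetical status register
+
+	// Poll until the device reports that the device-to-host transfer is done.
+	while (!(mmio.load(status) & 1))
+		;
+
+	// Constraint 1: this main memory read is ordered after the load() above,
+	// so it observes the data written by the device.
+	uint32_t first_word = dma_buffer[0];
+	(void)first_word;
+}
+```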
+
+**Implications** of these constraints are:
+- Since constraint 1 orders main memory reads that follow `load()` in program order,
+  it is safe to read data written by a device-to-host transfer
+  from a DMA buffer after reading a device status register.
+- Since constraint 2 orders main memory writes that precede `store()` in program order,
+  it is safe to write a device doorbell register
+  after filling the DMA buffers of a host-to-device transfer.
+- The fact that constraint 1 also orders main memory writes that follow `load()` in program order
+  helps to prevent some surprises. For example, some devices require load accesses
+  to clear status conditions.
+  Likewise, since constraint 2 also orders main memory reads that precede `store()`
+  in program order,
+  reusing a DMA buffer for device-to-host transfers after reading from the buffer
+  does not require additional barriers.
+- Due to constraint 4, `load()` and `store()` cannot be moved out of (or into) mutexes.
+
+**Caveats.** Note that constraints 1 and 2 only order `load()` and `store()` relative to accesses
+to main memory. They do not affect the order of `load()` and `store()` relative to each other.
+Some examples of access patterns that are not covered by the guarantees above are:
+- Main memory accesses before `load()` or after `store()`:
+  - A read from main memory followed by a `load()` from device memory
+    (unless the read is a load-acquire).
+  - A write to main memory followed by a `load()` from device memory.
+  - A `store()` to device memory followed by a read from main memory.
+  - A `store()` to device memory followed by a write to main memory
+    (unless the write is a store-release).
+- `load()` or `store()` to a device A followed by `load()` or `store()` to another device B.
+- Ordering of `load()` against a preceding store-release to main memory.
+  Likewise, ordering of `store()` against a following load-acquire from main memory.
+
+**Implementation.**
+The following table shows the architecture-specific barriers that are required
+to guarantee the constraints of `load()` and `store()`:
+
+|Architecture|Method|Before access|After access|
+|---|---|---|---|
+|Aarch64|`load()`||`dmb oshld`¹|
+|Aarch64|`store()`|`dmb osh`¹||
+|RISC-V|`load()`|`fence r, i`²|`fence i, rw`|
+|RISC-V|`store()`|`fence rw, o`|`fence o, w`²|
+
+¹ `dmb oshld` is enough to implement `load()` since it orders reads vs. reads and writes.
+On the other hand, `store()` requires `dmb osh`
+since `dmb oshst` only orders writes vs. writes.
+
+² `fence r, i` is required to order the `load()` access vs. an earlier load-acquire on main memory.
+Likewise, `fence o, w` is required to order the `store()` access
+vs. a later store-release on main memory.
+
+**Rationale.**
+An alternative model could weaken the constraints to only require ordering
+of main memory reads in constraint 1 and ordering of main memory writes in constraint 2.
+We opt for the stronger model instead since it prevents surprising behavior
+and we consider the extra barriers to be insignificant compared to the overall
+cost of MMIO accesses. In fact, on RISC-V, there is no difference in the required barriers.
+On Aarch64, the only difference is the use of the
+stronger `dmb osh` for `store()` compared to the weaker `dmb oshst`
+(with no difference in the implementation of `load()`).
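+
+To illustrate the `store()` side, here is a minimal sketch of the doorbell pattern
+from the implications above; the doorbell register at offset `0x10` is hypothetical:
+
+```cpp
+#include <cstddef>
+#include <cstdint>
+
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+void submit(arch::io_mem_space mmio, volatile uint32_t *dma_buffer, std::size_t n) {
+	arch::scalar_register<uint32_t> doorbell(0x10); // hypothetical doorbell register
+
+	// Fill the DMA buffer of the host-to-device transfer in main memory.
+	for (std::size_t i = 0; i < n; ++i)
+		dma_buffer[i] = 0xfeedface;
+
+	// Constraint 2: all main memory writes above are ordered before this
+	// store(), so the device observes a completely filled buffer.
+	mmio.store(doorbell, 1);
+}
+```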
+
+### `load_relaxed()` and `store_relaxed()`
+
+The `load_relaxed()` and `store_relaxed()` methods are intended for situations in
+which the ordering of device accesses and main memory accesses is either not relevant
+or is ensured by explicit barriers.
+
+Out of the properties that libarch guarantees for `load()` and `store()`,
+only constraint 3 (i.e., that the accesses respect the underlying I/O memory ordering)
+is guaranteed for `load_relaxed()` and `store_relaxed()`.
+
+**Caveats.** Note that this means that `load_relaxed()` and `store_relaxed()`
+may be moved out of (or into) mutexes if no extra barriers are used.
+
+## `arch::main_mem_space`
+
+`arch::main_mem_space` is intended to be used with main memory mappings
+that are accessed both from devices and from CPUs.
+For example, `arch::main_mem_space` is appropriate for descriptor
+rings in main memory that are accessed by devices.
+**If `arch::main_mem_space` is used with memory other than main memory
+(e.g., device memory), the ordering guarantees may be weaker than stated below.**
+
+### `load()` and `store()`
+
+libarch guarantees the following constraints for `load()` and `store()`:
+1. All main memory accesses that occur _after_ `load()` in program order
+   are ordered _after_ the `load()` access
+   for all observers (i.e., all CPUs and devices)
+   that have coherent visibility of both memory accesses.
+2. All main memory accesses that occur _before_ `store()` in program order
+   are ordered _before_ the `store()` access
+   for all observers (i.e., all CPUs and devices)
+   that have coherent visibility of both memory accesses.
+3. `load()` and `store()` interact with the C++11 memory model as ordinary memory accesses.
+
+See `arch::io_mem_space` for the **implications**, **caveats**, and **rationale** of these
+constraints.
+
+**Implementation.**
+The following table shows the architecture-specific barriers that are required
+to guarantee the constraints of `load()` and `store()`:
+
+|Architecture|Method|Before access|After access|
+|---|---|---|---|
+|Aarch64|`load()`||`dmb ishld`¹|
+|Aarch64|`store()`|`dmb ish`¹||
+|RISC-V|`load()`|²|`fence r, rw`|
+|RISC-V|`store()`|`fence rw, w`|²|
+
+¹ See the explanation for `arch::io_mem_space`.
+Also note that inner shareable barriers are enough for main memory accesses.
+
+² In contrast to `arch::io_mem_space`, we do not need barriers here
+since load-acquire and store-release are already ordered correctly
+relative to main memory accesses.
+
+## `arch::mem_space`
+
+`arch::mem_space` simultaneously provides the guarantees of `arch::io_mem_space`
+and `arch::main_mem_space`. **Its use is discouraged unless the underlying memory region
+may be mapped as either device memory or main memory.**
+
+**Implementation.**
+The following table shows the architecture-specific barriers that are required
+to guarantee the constraints of `load()` and `store()`:
+
+|Architecture|Method|Before access|After access|
+|---|---|---|---|
+|Aarch64|`load()`||`dmb oshld`¹|
+|Aarch64|`store()`|`dmb osh`¹||
+|RISC-V|`load()`|`fence r, i`²|`fence ir, rw`³|
+|RISC-V|`store()`|`fence rw, ow`³|`fence o, w`²|
+
+¹ ² See the explanations for `arch::io_mem_space`.
+
+³ These barriers need to be strong enough to order
+`load()` and `store()` against both main memory and I/O memory accesses.
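+
+## Example: combining the spaces
+
+A driver will often combine the spaces above: `arch::main_mem_space` for a
+descriptor ring in main memory and `arch::io_mem_space` for the device's
+doorbell register. A minimal sketch with a hypothetical ring layout:
+
+```cpp
+#include <cstdint>
+
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+void push_descriptor(arch::main_mem_space ring, arch::io_mem_space mmio,
+		uint64_t buffer_addr, unsigned int slot) {
+	// Hypothetical layout: one 64-bit descriptor per slot, plus a doorbell.
+	arch::scalar_register<uint64_t> descriptor(slot * 8);
+	arch::scalar_register<uint32_t> ring_head(0x30);
+
+	// main_mem_space store(): an ordinary write under the C++11 memory model
+	// (constraint 3 of arch::main_mem_space).
+	ring.store(descriptor, buffer_addr);
+
+	// io_mem_space store(): constraint 2 orders the descriptor write above
+	// before this doorbell write, so the device observes a consistent ring.
+	mmio.store(ring_head, slot + 1);
+}
+```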
diff --git a/include/arch/aarch64/mem_space.hpp b/include/arch/aarch64/mem_space.hpp
index fa49ede..010154d 100644
--- a/include/arch/aarch64/mem_space.hpp
+++ b/include/arch/aarch64/mem_space.hpp
@@ -168,6 +168,34 @@ namespace _detail {
 	};
 }
 
+template<typename B>
+struct io_mem_ops : _detail::mem_ops<B> { // inherits load_relaxed()/store_relaxed()
+	static B load(const B *p) {
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("dmb oshld" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("dmb osh" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+	}
+};
+
+template<typename B>
+struct main_mem_ops : _detail::mem_ops<B> { // inherits load_relaxed()/store_relaxed()
+	static B load(const B *p) {
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("dmb ishld" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("dmb ish" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+	}
+};
+
 using _detail::mem_ops;
 
 } // namespace arch
diff --git a/include/arch/arm/mem_space.hpp b/include/arch/arm/mem_space.hpp
index bf11e47..ebb3205 100644
--- a/include/arch/arm/mem_space.hpp
+++ b/include/arch/arm/mem_space.hpp
@@ -123,6 +123,14 @@ namespace _detail {
 	};
 }
 
+// TODO: This is not correct.
+template<typename B>
+using io_mem_ops = _detail::mem_ops<B>;
+
+// TODO: This is not correct.
+template<typename B>
+using main_mem_ops = _detail::mem_ops<B>;
+
 using _detail::mem_ops;
 
 } // namespace arch
diff --git a/include/arch/mem_space.hpp b/include/arch/mem_space.hpp
index a44d324..964bb3b 100644
--- a/include/arch/mem_space.hpp
+++ b/include/arch/mem_space.hpp
@@ -14,31 +14,34 @@ namespace arch {
 
-struct mem_space {
-	constexpr mem_space()
+namespace _details {
+
+template<template<typename> typename Ops>
+struct base_mem_space {
+	constexpr base_mem_space()
 	: _base(0) { }
 
-	constexpr mem_space(uintptr_t base)
+	constexpr base_mem_space(uintptr_t base)
 	: _base(base) { }
 
-	mem_space(void *base)
+	base_mem_space(void *base)
 	: _base(reinterpret_cast<uintptr_t>(base)) { }
 
-	mem_space subspace(ptrdiff_t offset) const {
-		return mem_space(reinterpret_cast<void *>(_base + offset));
+	base_mem_space subspace(ptrdiff_t offset) const {
+		return base_mem_space(reinterpret_cast<void *>(_base + offset));
 	}
 
 	template<typename RT>
 	void store(RT r, typename RT::rep_type value) const {
 		auto p = reinterpret_cast<typename RT::bits_type *>(_base + r.offset());
 		auto v = static_cast<typename RT::bits_type>(value);
-		mem_ops<typename RT::bits_type>::store(p, v);
+		Ops<typename RT::bits_type>::store(p, v);
 	}
 
 	template<typename RT>
 	typename RT::rep_type load(RT r) const {
 		auto p = reinterpret_cast<const typename RT::bits_type *>(_base + r.offset());
-		auto b = mem_ops<typename RT::bits_type>::load(p);
+		auto b = Ops<typename RT::bits_type>::load(p);
 		return static_cast<typename RT::rep_type>(b);
 	}
@@ -46,13 +49,13 @@ struct mem_space {
 	void store_relaxed(RT r, typename RT::rep_type value) const {
 		auto p = reinterpret_cast<typename RT::bits_type *>(_base + r.offset());
 		auto v = static_cast<typename RT::bits_type>(value);
-		mem_ops<typename RT::bits_type>::store_relaxed(p, v);
+		Ops<typename RT::bits_type>::store_relaxed(p, v);
 	}
 
 	template<typename RT>
 	typename RT::rep_type load_relaxed(RT r) const {
 		auto p = reinterpret_cast<const typename RT::bits_type *>(_base + r.offset());
-		auto b = mem_ops<typename RT::bits_type>::load_relaxed(p);
+		auto b = Ops<typename RT::bits_type>::load_relaxed(p);
 		return static_cast<typename RT::rep_type>(b);
 	}
 
@@ -60,6 +63,12 @@ struct mem_space {
 	uintptr_t _base;
 };
 
+} // namespace _details
+
+using io_mem_space = _details::base_mem_space<io_mem_ops>;
+using main_mem_space = _details::base_mem_space<main_mem_ops>;
+using mem_space = _details::base_mem_space<mem_ops>;
+
 static constexpr mem_space global_mem{};
 
 } // namespace arch
diff --git a/include/arch/riscv64/mem_space.hpp b/include/arch/riscv64/mem_space.hpp
index 68c8dd0..3c9a432 100644
--- a/include/arch/riscv64/mem_space.hpp
+++ b/include/arch/riscv64/mem_space.hpp
@@ -91,6 +91,36 @@ namespace _detail {
 	};
}
 
+template<typename B>
+struct io_mem_ops : _detail::mem_ops<B> { // inherits load_relaxed()/store_relaxed()
+	static B load(const B *p) {
+		asm volatile("fence r, i" ::: "memory");
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
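+		// The trailing fence orders the device read before all subsequent
+		// memory accesses (constraint 1 in docs/src/memory-order.md).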
+		asm volatile("fence i, rw" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("fence rw, o" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+		asm volatile("fence o, w" ::: "memory");
+	}
+};
+
+template<typename B>
+struct main_mem_ops : _detail::mem_ops<B> { // inherits load_relaxed()/store_relaxed()
+	static B load(const B *p) {
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("fence r, rw" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("fence rw, w" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+	}
+};
+
 using _detail::mem_ops;
 
 } // namespace arch
diff --git a/include/arch/x86/mem_space.hpp b/include/arch/x86/mem_space.hpp
index 3a504bd..fd3ba06 100644
--- a/include/arch/x86/mem_space.hpp
+++ b/include/arch/x86/mem_space.hpp
@@ -93,6 +93,14 @@ namespace _detail {
 	};
 }
 
+// x86 has TSO, which is strong enough to not require barriers anywhere.
+
+template<typename B>
+using io_mem_ops = _detail::mem_ops<B>;
+
+template<typename B>
+using main_mem_ops = _detail::mem_ops<B>;
+
 using _detail::mem_ops;
 
 } // namespace arch
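For reference, the `_detail::mem_ops` primitives that the x86 aliases forward to, and that the barrier-adding wrappers above build on, boil down to volatile accesses. The following is a simplified sketch of that idea, not libarch's actual implementation:

```cpp
// Sketch only: a relaxed accessor in the style of _detail::mem_ops.
// Volatile forces the compiler to emit exactly one access of the given
// width; it provides no ordering by itself, which is why io_mem_ops and
// main_mem_ops add dmb/fence instructions on Aarch64 and RISC-V.
template<typename B>
struct mem_ops_sketch {
	static B load_relaxed(const B *p) {
		return *static_cast<const volatile B *>(p);
	}

	static void store_relaxed(B *p, B v) {
		*static_cast<volatile B *>(p) = v;
	}
};
```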