diff --git a/docs/book.toml b/docs/book.toml
new file mode 100644
index 0000000..95e32d1
--- /dev/null
+++ b/docs/book.toml
@@ -0,0 +1,16 @@
+[book]
+authors = ["The Managarm Project"]
+language = "en"
+multilingual = false
+src = "src"
+title = "libarch documentation"
+
+[output.html]
+site-url = "/libarch/"
+git-repository-url = "https://github.com/managarm/libarch"
+mathjax-support = true
+
+[output.html.search]
+enable = true
+limit-results = 30
+use-boolean-and = true
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
new file mode 100644
index 0000000..d6f8570
--- /dev/null
+++ b/docs/src/SUMMARY.md
@@ -0,0 +1,4 @@
+# Summary
+
+[libarch](index.md)
+- [Memory ordering](memory-order.md)
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 0000000..f7e70e9
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,14 @@
+# libarch
+
+libarch is a library that abstracts over architecture-specific details when accessing hardware (e.g., as part of device drivers).
+Among other things, it provides:
+- helpers to deal with bitfields,
+- interfaces for DMA buffer allocation,
+- DMA and MMIO barriers,
+- endianness conversion.
+
+Documentation permalink:
+
+## Projects using libarch
+
+ - [Managarm](https://managarm.org) - Pragmatic microkernel-based OS with fully asynchronous I/O.
diff --git a/docs/src/memory-order.md b/docs/src/memory-order.md
new file mode 100644
index 0000000..90b57c4
--- /dev/null
+++ b/docs/src/memory-order.md
@@ -0,0 +1,170 @@
+# Memory ordering
+
+## `arch::io_mem_space`
+
+`arch::io_mem_space` is intended to be used with I/O memory mappings (i.e., MMIO).
+**If `arch::io_mem_space` is used with other memory (e.g., main memory),
+the ordering guarantees that libarch provides may be weaker than stated below.**
+
+### `load()` and `store()`
+
+The `load()` and `store()` methods are intended to be suitable for most MMIO accesses that need to
+be done in device drivers. libarch guarantees the following constraints for `load()` and `store()`:
+1. All main memory accesses that occur _after_ `load()` in program order
+   are ordered _after_ the `load()` access
+   for all observers (i.e., all CPUs and devices)
+   which have coherent visibility of both memory accesses.
+2. All main memory accesses that occur _before_ `store()` in program order
+   are ordered _before_ the `store()` access
+   for all observers (i.e., all CPUs and devices)
+   which have coherent visibility of both memory accesses.
+3. `load()` and `store()` respect the I/O memory ordering of the underlying
+   memory mapping. In particular, if the underlying mapping does not permit
+   reordering, then `load()` and `store()` calls to the same device memory region are strongly ordered,
+   i.e., all observers observe all `load()` and `store()` accesses to the same device
+   memory region in the same order.
+   What constitutes a memory region is architecture-, bus-, and device-specific.
+4. `load()` interacts as expected (i.e., as if it was a main memory read)
+ with preceding load-acquire reads from main memory.
+ Likewise, `store()` interacts as expected (i.e., as if it was a main memory write)
+ with following store-release writes to main memory.
+
+Here, "coherent visibility" means that either the device is cache coherent with the
+CPU that performs the `load()` or `store()`, or that appropriate cache clean and/or invalidate
+operations are done.
+
+**Implications** of these constraints are:
+- Since constraint 1 orders main memory reads that follow `load()` in program order,
+  it is safe to read data written by a device-to-host transfer
+  from a DMA buffer after reading a device status register (see the sketch below).
+- Since constraint 2 orders main memory writes that precede `store()` in program order,
+ it is safe to write a device doorbell register
+ after filling the DMA buffers of a host-to-device transfer.
+- The fact that constraint 1 also orders main memory writes that follow `load()` in program order
+  helps to prevent some surprises. For example, some devices clear status conditions
+  as a side effect of a load access.
+  Likewise, since constraint 2 also orders main memory reads that precede `store()`
+  in program order,
+  reusing a DMA buffer for device-to-host transfers after reading from the buffer
+  does not require additional barriers.
+- Due to constraint 4, `load()` and `store()` cannot be moved out of (or into) mutexes.
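+
+To make the first two implications concrete, here is a minimal sketch of both patterns.
+The device, its registers, the offsets, and the buffer handling are made up for this example,
+and `arch::scalar_register` is only one possible way to describe the registers:
+
+```cpp
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+#include <cstdint>
+
+// Hypothetical register layout; offsets and semantics are examples only.
+constexpr arch::scalar_register<uint32_t> status{0x00};
+constexpr arch::scalar_register<uint32_t> doorbell{0x04};
+
+// tx_buffer and rx_buffer are assumed to be DMA buffers in main memory
+// that the device observes coherently (or after appropriate cache maintenance).
+void submit(arch::io_mem_space regs, uint32_t *tx_buffer, uint32_t data) {
+	// Fill the DMA buffer first; constraint 2 orders this write
+	// before the doorbell store() for all coherent observers.
+	tx_buffer[0] = data;
+	regs.store(doorbell, 1);
+}
+
+uint32_t complete(arch::io_mem_space regs, const uint32_t *rx_buffer) {
+	// Constraint 1 orders the buffer read after the status load(),
+	// so no stale DMA data is read.
+	auto s = regs.load(status);
+	(void)s; // A real driver would check the status bits here.
+	return rx_buffer[0];
+}
+```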
+
+**Caveats.** Note that constraints 1 and 2 only order `load()` and `store()` relative to accesses
+to main memory. They do not affect the order of `load()` and `store()` relative to each other.
+Some examples of access patterns that are not covered by the guarantees above are:
+- Main memory accesses before `load()` or after `store()`:
+  - A read from main memory followed by a `load()` from device memory
+    (unless the read is a load-acquire).
+  - A write to main memory followed by a `load()` from device memory.
+  - A `store()` to device memory followed by a read from main memory.
+  - A `store()` to device memory followed by a write to main memory
+    (unless the write is a store-release).
+- `load()` or `store()` to a device A followed by `load()` or `store()` to another device B (see the example below).
+- Ordering of `load()` against preceding store-release to main memory.
+ Likewise, ordering of `store()` against following load-acquire from main memory.
+
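+As a concrete example of the device-to-device caveat (register names and offsets below are
+made up), the following two stores may become visible in either order; if cross-device
+ordering matters, an explicit barrier has to be placed between them (which barrier is
+appropriate is architecture-specific and not covered by these guarantees):
+
+```cpp
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+#include <cstdint>
+
+// Hypothetical control registers of two unrelated devices.
+constexpr arch::scalar_register<uint32_t> ctrl_a{0x00};
+constexpr arch::scalar_register<uint32_t> ctrl_b{0x00};
+
+void kick_both(arch::io_mem_space dev_a, arch::io_mem_space dev_b) {
+	dev_a.store(ctrl_a, 1);
+	// Not ordered relative to the store above: constraints 1 and 2 only order
+	// device accesses against main memory, not against accesses to other devices.
+	dev_b.store(ctrl_b, 1);
+}
+```
+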
+**Implementation.**
+The following table depicts the architecture-specific barriers that are required
+to guarantee the constraints of `load()` and `store()`:
+
+|Architecture|Method|Before access|After access|
+|---|---|---|---|
+|Aarch64|`load()`||`dmb oshld`¹|
+|Aarch64|`store()`|`dmb osh`¹||
+|RISC-V|`load()`|`fence r, i`²|`fence i, rw`|
+|RISC-V|`store()`|`fence rw, o`|`fence o, w`²|
+
+¹ `dmb oshld` is enough to implement `load()` since it orders reads vs. both reads and writes.
+On the other hand, `store()` requires `dmb osh`
+since `dmb oshst` only orders writes vs. writes.
+
+² `fence r, i` is required to order the `load()` access vs. an earlier load-acquire on main memory.
+Likewise, `fence o, w` is required to order the `store()` access
+vs. a later store-release on main memory.
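+
+For illustration, on Aarch64 the table above corresponds roughly to the following sketch
+(simplified; `mem_ops<B>::load_relaxed()`/`store_relaxed()` stand in for the plain, unordered
+accesses of the underlying implementation):
+
+```cpp
+template<typename B>
+struct io_mem_ops {
+	static B load(const B *p) {
+		auto v = mem_ops<B>::load_relaxed(p);
+		asm volatile("dmb oshld" ::: "memory"); // barrier after the access
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("dmb osh" ::: "memory"); // barrier before the access
+		mem_ops<B>::store_relaxed(p, v);
+	}
+};
+```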
+
+**Rationale.**
+An alternative model could weaken the constraints to only require ordering
+of main memory reads in constraint 1 and ordering of main memory writes in constraint 2.
+We opt for the stronger model instead since it can prevent surprising behavior
+and we consider the extra barriers to be insignificant compared to the overall
+costs of MMIO accesses. In fact, on RISC-V, there is no difference in required barriers.
+On Aarch64, the only difference is the use of the
+stronger `dmb osh` for `store()` compared to the weaker `dmb oshst`
+(with no difference in the implementation of `load()`).
+
+### `load_relaxed()` and `store_relaxed()`
+
+The `load_relaxed()` and `store_relaxed()` methods are intended for situations in
+which the ordering of device accesses and main memory accesses is either not relevant
+or is ensured by explicit barriers.
+
+Out of the properties that libarch guarantees for `load()` and `store()`,
+only constraint 3 (i.e., that the accesses respect the underlying I/O memory ordering)
+is guaranteed for `load_relaxed()` and `store_relaxed()`.
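+
+A typical use is a sequence of register accesses that does not interact with main memory at all,
+for example programming a few configuration registers of a single device, where constraint 3
+alone already provides the required ordering. The registers and offsets below are made up:
+
+```cpp
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+#include <cstdint>
+
+// Hypothetical configuration registers of one device.
+constexpr arch::scalar_register<uint32_t> cfg_lo{0x10};
+constexpr arch::scalar_register<uint32_t> cfg_hi{0x14};
+constexpr arch::scalar_register<uint32_t> enable{0x18};
+
+void configure(arch::io_mem_space regs, uint64_t base) {
+	// No DMA buffer in main memory is involved, so the relaxed variants suffice;
+	// on a non-reordering I/O mapping, constraint 3 still orders these accesses
+	// relative to each other.
+	regs.store_relaxed(cfg_lo, static_cast<uint32_t>(base));
+	regs.store_relaxed(cfg_hi, static_cast<uint32_t>(base >> 32));
+	regs.store_relaxed(enable, 1);
+}
+```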
+
+**Caveats.** Note that this means that `load_relaxed()` and `store_relaxed()`
+may be moved out of (or into) mutexes if no extra barriers are used.
+
+## `arch::main_mem_space`
+
+`arch::main_mem_space` is intended to be used with main memory mappings
+that are accessed both from devices and from CPUs.
+For example, `arch::main_mem_space` is appropriate for descriptor
+rings in main memory that are accessed by devices.
+**If `arch::main_mem_space` is used with memory other than main memory
+(e.g., device memory), the ordering guarantees may be weaker than stated below.**
+
+### `load()` and `store()`
+
+libarch guarantees the following constraints for `load()` and `store()`:
+1. All main memory accesses that occur _after_ `load()` in program order
+   are ordered _after_ the `load()` access
+   for all observers (i.e., all CPUs and devices)
+   which have coherent visibility of both memory accesses.
+2. All main memory accesses that occur _before_ `store()` in program order
+   are ordered _before_ the `store()` access
+   for all observers (i.e., all CPUs and devices)
+   which have coherent visibility of both memory accesses.
+3. `load()` and `store()` interact with the C++11 memory model as ordinary main memory accesses.
+
+See `arch::io_mem_space` for the **implications**, **caveats**, and **rationale** of these
+constraints.
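+
+As a sketch of the intended use (descriptor layout, field offsets, and register names below are
+made up for this example), a driver might fill a descriptor in main memory through
+`arch::main_mem_space` and then notify the device through `arch::io_mem_space`:
+
+```cpp
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+#include <cstdint>
+
+// Hypothetical layout of one descriptor within the ring.
+constexpr arch::scalar_register<uint64_t> desc_addr{0x00};
+constexpr arch::scalar_register<uint32_t> desc_len{0x08};
+constexpr arch::scalar_register<uint32_t> desc_flags{0x0c};
+constexpr uint32_t desc_size = 0x10;
+
+// Hypothetical doorbell register of the device.
+constexpr arch::scalar_register<uint32_t> doorbell{0x00};
+
+void post(arch::main_mem_space ring, arch::io_mem_space regs,
+		uint32_t index, uint64_t buffer, uint32_t length) {
+	auto desc = ring.subspace(index * desc_size);
+	desc.store(desc_addr, buffer);
+	desc.store(desc_len, length);
+	// Constraint 2 of arch::main_mem_space orders the two stores above before
+	// this one, so the device never sees the "ready" flag with stale fields.
+	desc.store(desc_flags, 1 /* hypothetical "ready" bit */);
+	// Constraint 2 of arch::io_mem_space orders all descriptor writes before
+	// the doorbell MMIO write.
+	regs.store(doorbell, index);
+}
+```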
+
+**Implementation.**
+The following table depicts the architecture-specific barriers that are required
+to guarantee the constraints of `load()` and `store()`:
+
+|Architecture|Method|Before access|After access|
+|---|---|---|---|
+|Aarch64|`load()`||`dmb ishld`¹|
+|Aarch64|`store()`|`dmb ish`¹||
+|RISC-V|`load()`|²|`fence r, rw`|
+|RISC-V|`store()`|`fence rw, w`|²|
+
+¹ See explanation for `arch::io_mem_space`.
+Also note that inner shareable barriers are enough for main memory accesses.
+
+² In contrast to `arch::io_mem_space`, we do not need barriers here
+since load-acquire and store-release will already be ordered correctly
+relative to main memory accesses.
+
+## `arch::mem_space`
+
+`arch::mem_space` simultaneously provides the guarantees of `arch::io_mem_space`
+and `arch::main_mem_space`. **Its use is discouraged unless the underlying memory region
+may be mapped either as device memory or as main memory.**
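+
+For instance (a hypothetical example), a helper that writes to a mailbox region which,
+depending on the platform, may be backed either by an MMIO window or by ordinary main
+memory shared with firmware could take an `arch::mem_space`:
+
+```cpp
+#include <arch/mem_space.hpp>
+#include <arch/register.hpp>
+
+#include <cstdint>
+
+// Hypothetical mailbox layout; offsets are made up for this example.
+constexpr arch::scalar_register<uint32_t> mbox_data{0x00};
+constexpr arch::scalar_register<uint32_t> mbox_ready{0x04};
+
+// Works regardless of how the mailbox is mapped, at the cost of the stronger
+// combined barriers shown in the table below.
+void send(arch::mem_space mbox, uint32_t word) {
+	mbox.store(mbox_data, word);
+	mbox.store(mbox_ready, 1);
+}
+```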
+
+**Implementation.**
+The following table depicts the architecture-specific barriers that are required
+to guarantee the constraints of `load()` and `store()`:
+
+|Architecture|Method|Before access|After access|
+|---|---|---|---|
+|Aarch64|`load()`||`dmb oshld`¹|
+|Aarch64|`store()`|`dmb osh`¹||
+|RISC-V|`load()`|`fence r, i`²|`fence ir, rw`³|
+|RISC-V|`store()`|`fence rw, ow`³|`fence o, w`²|
+
+¹ ² See explanation for `arch::io_mem_space`.
+
+³ This barrier needs to be strong enough to order
+`load()` and `store()` to both main memory and I/O memory.
diff --git a/include/arch/aarch64/mem_space.hpp b/include/arch/aarch64/mem_space.hpp
index fa49ede..010154d 100644
--- a/include/arch/aarch64/mem_space.hpp
+++ b/include/arch/aarch64/mem_space.hpp
@@ -168,6 +168,34 @@ namespace _detail {
};
}
+template<typename B>
+struct io_mem_ops {
+	static B load(const B *p) {
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("dmb oshld" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("dmb osh" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+	}
+};
+
+template<typename B>
+struct main_mem_ops {
+	static B load(const B *p) {
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("dmb ishld" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("dmb ish" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+	}
+};
+
using _detail::mem_ops;
} // namespace arch
diff --git a/include/arch/arm/mem_space.hpp b/include/arch/arm/mem_space.hpp
index bf11e47..ebb3205 100644
--- a/include/arch/arm/mem_space.hpp
+++ b/include/arch/arm/mem_space.hpp
@@ -123,6 +123,14 @@ namespace _detail {
};
}
+// TODO: This is not correct.
+template<typename B>
+using io_mem_ops = _detail::mem_ops<B>;
+
+// TODO: This is not correct.
+template<typename B>
+using main_mem_ops = _detail::mem_ops<B>;
+
using _detail::mem_ops;
} // namespace arch
diff --git a/include/arch/mem_space.hpp b/include/arch/mem_space.hpp
index a44d324..964bb3b 100644
--- a/include/arch/mem_space.hpp
+++ b/include/arch/mem_space.hpp
@@ -14,31 +14,34 @@
namespace arch {
-struct mem_space {
- constexpr mem_space()
+namespace _details {
+
+template<template<typename> typename Ops>
+struct base_mem_space {
+ constexpr base_mem_space()
: _base(0) { }
- constexpr mem_space(uintptr_t base)
+ constexpr base_mem_space(uintptr_t base)
: _base(base) { }
- mem_space(void *base)
+ base_mem_space(void *base)
: _base(reinterpret_cast<uintptr_t>(base)) { }
- mem_space subspace(ptrdiff_t offset) const {
- return mem_space(reinterpret_cast<void *>(_base + offset));
+ base_mem_space subspace(ptrdiff_t offset) const {
+ return base_mem_space(reinterpret_cast<void *>(_base + offset));
}
template<typename RT>
void store(RT r, typename RT::rep_type value) const {
auto p = reinterpret_cast<typename RT::bits_type *>(_base + r.offset());
auto v = static_cast<typename RT::bits_type>(value);
- mem_ops<typename RT::bits_type>::store(p, v);
+ Ops<typename RT::bits_type>::store(p, v);
}
template<typename RT>
typename RT::rep_type load(RT r) const {
auto p = reinterpret_cast<const typename RT::bits_type *>(_base + r.offset());
- auto b = mem_ops<typename RT::bits_type>::load(p);
+ auto b = Ops<typename RT::bits_type>::load(p);
return static_cast<typename RT::rep_type>(b);
}
@@ -46,13 +49,13 @@ struct mem_space {
void store_relaxed(RT r, typename RT::rep_type value) const {
auto p = reinterpret_cast<typename RT::bits_type *>(_base + r.offset());
auto v = static_cast<typename RT::bits_type>(value);
- mem_ops<typename RT::bits_type>::store_relaxed(p, v);
+ Ops<typename RT::bits_type>::store_relaxed(p, v);
}
template<typename RT>
typename RT::rep_type load_relaxed(RT r) const {
auto p = reinterpret_cast<const typename RT::bits_type *>(_base + r.offset());
- auto b = mem_ops<typename RT::bits_type>::load_relaxed(p);
+ auto b = Ops<typename RT::bits_type>::load_relaxed(p);
return static_cast<typename RT::rep_type>(b);
}
@@ -60,6 +63,12 @@ struct mem_space {
uintptr_t _base;
};
+} // namespace _details
+
+using io_mem_space = _details::base_mem_space<io_mem_ops>;
+using main_mem_space = _details::base_mem_space<main_mem_ops>;
+using mem_space = _details::base_mem_space<mem_ops>;
+
static constexpr mem_space global_mem{};
} // namespace arch
diff --git a/include/arch/riscv64/mem_space.hpp b/include/arch/riscv64/mem_space.hpp
index 68c8dd0..3c9a432 100644
--- a/include/arch/riscv64/mem_space.hpp
+++ b/include/arch/riscv64/mem_space.hpp
@@ -91,6 +91,36 @@ namespace _detail {
};
}
+template<typename B>
+struct io_mem_ops {
+	static B load(const B *p) {
+		asm volatile("fence r, i" ::: "memory");
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("fence i, rw" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("fence rw, o" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+		asm volatile("fence o, w" ::: "memory");
+	}
+};
+
+template<typename B>
+struct main_mem_ops {
+	static B load(const B *p) {
+		auto v = _detail::mem_ops<B>::load_relaxed(p);
+		asm volatile("fence r, rw" ::: "memory");
+		return v;
+	}
+
+	static void store(B *p, B v) {
+		asm volatile("fence rw, w" ::: "memory");
+		_detail::mem_ops<B>::store_relaxed(p, v);
+	}
+};
+
using _detail::mem_ops;
} // namespace arch
diff --git a/include/arch/x86/mem_space.hpp b/include/arch/x86/mem_space.hpp
index 3a504bd..fd3ba06 100644
--- a/include/arch/x86/mem_space.hpp
+++ b/include/arch/x86/mem_space.hpp
@@ -93,6 +93,14 @@ namespace _detail {
};
}
+// x86 has TSO which is strong enough to not require barriers anywhere.
+
+template<typename B>
+using io_mem_ops = _detail::mem_ops<B>;
+
+template<typename B>
+using main_mem_ops = _detail::mem_ops<B>;
+
using _detail::mem_ops;
} // namespace arch