From 79d36c8241c7b55805a5f63eaabffd8af37184cb Mon Sep 17 00:00:00 2001
From: joshieDo
Date: Thu, 29 Feb 2024 12:53:12 +0000
Subject: [PATCH] update docs and diagrams

---
 crates/etl/src/lib.rs | 7 +-
 crates/stages/src/stages/bodies.rs | 4 +-
 crates/stages/src/stages/execution.rs | 3 +-
 crates/stages/src/stages/headers.rs | 21 ++-
 crates/stages/src/stages/tx_lookup.rs | 4 +-
 crates/static-file/README.md | 125 +++++++++++-------
 .../provider/src/providers/database/mod.rs | 2 +-
 7 files changed, 99 insertions(+), 67 deletions(-)

diff --git a/crates/etl/src/lib.rs b/crates/etl/src/lib.rs
index 0fc865b98d23..45ff65f6bbb3 100644
--- a/crates/etl/src/lib.rs
+++ b/crates/etl/src/lib.rs
@@ -30,9 +30,12 @@ use tempfile::{NamedTempFile, TempDir};
 /// An ETL (extract, transform, load) data collector.
 ///
 /// Data is pushed (extract) to the collector which internally flushes the data in a sorted
-/// (transform) manner to files of some specified capacity.
+/// (transform) manner to files of some specified capacity. The data can later be iterated over
+/// (load) in a sorted manner.
 ///
-/// The data can later be iterated over (load) in a sorted manner.
+/// Used mainly to insert data into `MDBX` in a sorted manner. This is important because
+/// performance and storage space degrade greatly if the data is inserted unsorted (e.g. tables
+/// with hashes as keys), as opposed to appending or inserting already sorted data. Some
+/// benchmarks can be found [here](https://github.com/paradigmxyz/reth/pull/1130#issuecomment-1418642755).
 #[derive(Debug)]
 pub struct Collector<K, V>
 where
diff --git a/crates/stages/src/stages/bodies.rs b/crates/stages/src/stages/bodies.rs
index 459eace72074..6893350af770 100644
--- a/crates/stages/src/stages/bodies.rs
+++ b/crates/stages/src/stages/bodies.rs
@@ -25,7 +25,7 @@ use tracing::*;
 // TODO(onbjerg): Metrics and events (gradual status for e.g. CLI)
 /// The body stage downloads block bodies.
 ///
-/// The body stage downloads block bodies for all block headers stored locally in the database.
+/// The body stage downloads block bodies for all block headers available in storage.
 ///
 /// # Empty blocks
 ///
@@ -33,7 +33,7 @@ use tracing::*;
 /// no transactions will not have a block body downloaded for them, since it would be meaningless to
 /// do so.
 ///
-/// This also means that if there is no body for the block in the database (assuming the
+/// This also means that if there is no body for the block in storage (assuming the
 /// block number <= the synced block of this stage), then the block can be considered empty.
 ///
 /// # Tables
diff --git a/crates/stages/src/stages/execution.rs b/crates/stages/src/stages/execution.rs
index 724603e41c03..6a1688d758b3 100644
--- a/crates/stages/src/stages/execution.rs
+++ b/crates/stages/src/stages/execution.rs
@@ -458,7 +458,8 @@ impl Stage for ExecutionStage {
             }
         }
     } else {
-        // We database for Receipts, if there is any kind of receipt pruning/filtering.
+        // We use the database for receipts if there is any kind of receipt pruning/filtering,
+        // since static files do not support it.
         let mut cursor = tx.cursor_write::<tables::Receipts>()?;
         let mut reverse_walker = cursor.walk_back(None)?;
diff --git a/crates/stages/src/stages/headers.rs b/crates/stages/src/stages/headers.rs
index d34ffa46ba8e..740445c24e0c 100644
--- a/crates/stages/src/stages/headers.rs
+++ b/crates/stages/src/stages/headers.rs
@@ -34,17 +34,15 @@ use tracing::*;
 /// The headers stage.
 ///
-/// The headers stage downloads all block headers from the highest block in the local database to
+/// The headers stage downloads all block headers from the highest block in storage to
 /// the perceived highest block on the network.
 ///
-/// The headers are processed and data is inserted into these tables:
+/// The headers are processed and data is inserted into static files, as well as into the
+/// [`HeaderNumbers`][reth_db::tables::HeaderNumbers] table.
 ///
-/// - [`HeaderNumbers`][reth_db::tables::HeaderNumbers]
-/// - [`Headers`][reth_db::tables::Headers]
-/// - [`CanonicalHeaders`][reth_db::tables::CanonicalHeaders]
-///
-/// NOTE: This stage downloads headers in reverse. Upon returning the control flow to the pipeline,
-/// the stage checkpoint is not updated until this stage is done.
+/// NOTE: This stage downloads headers in reverse and pushes them to the ETL [`Collector`]. It
+/// then writes them sequentially to static files. The stage checkpoint is not updated until
+/// this stage is done.
 #[derive(Debug)]
 pub struct HeaderStage {
     /// Database handle.
@@ -91,11 +89,10 @@ where
         }
     }
-    /// Write downloaded headers to the given transaction from ETL.
+    /// Write downloaded headers from the ETL [`Collector`] to storage.
     ///
-    /// Writes to the following tables:
-    /// [`tables::Headers`], [`tables::CanonicalHeaders`], [`tables::HeaderTerminalDifficulties`]
-    /// and [`tables::HeaderNumbers`].
+    /// Writes to the static files (`Header | HeaderTD | HeaderHash`) and to the
+    /// [`tables::HeaderNumbers`] database table.
     fn write_headers(
         &mut self,
         tx: &<DB as Database>::TXMut,
diff --git a/crates/stages/src/stages/tx_lookup.rs b/crates/stages/src/stages/tx_lookup.rs
index a619fe709b57..6f2f72df660b 100644
--- a/crates/stages/src/stages/tx_lookup.rs
+++ b/crates/stages/src/stages/tx_lookup.rs
@@ -23,8 +23,8 @@ use tracing::*;
 /// The transaction lookup stage.
 ///
-/// This stage walks over the bodies table, and sets the transaction hash of each transaction in a
-/// block to the corresponding `BlockNumber` at each block. This is written to the
-/// [`tables::TransactionHashNumbers`] This is used for looking up changesets via the transaction
-/// hash.
+/// This stage walks over existing transactions and maps the hash of each transaction to its
+/// `BlockNumber`, writing it to the [`tables::TransactionHashNumbers`] table. This mapping is
+/// used for looking up changesets via the transaction hash.
 ///
diff --git a/crates/static-file/README.md b/crates/static-file/README.md
index 1d455475a595..b6eb385dd9ef 100644
--- a/crates/static-file/README.md
+++ b/crates/static-file/README.md
@@ -4,85 +4,116 @@
 Data that has reached a finalized state and won't undergo further changes (essentially frozen) should be read without concerns of modification. This makes it unsuitable for traditional databases.

-This crate aims to copy this data from the current database to multiple static files, aggregated by block ranges. At every 500_000th block new static files are created.
+This crate aims to copy this data from the current database to multiple static files, aggregated by block ranges. At every 500_000th block, a new static file is created.

-Below are two diagrams illustrating the processes of creating static files (custom format: `NippyJar`) and querying them. A glossary is also provided to explain the different (linked) components involved in these processes.
+Below are four diagrams illustrating how data is served from static files to the provider. A glossary is also provided to explain the different (linked) components involved in these processes.
+
+
+### Query Diagrams ([`Provider`](../../crates/storage/provider/src/providers/database/mod.rs#L41))

 <details>
-  <summary>Creation diagram (StaticFileProducer)</summary>
+  <summary>By block number</summary>

 ```mermaid
 graph TD;
- I("BLOCK_HEIGHT % 500_000 == 0")--triggers-->SP(StaticFileProducer)
- SP --> |triggers| SH["create_static_file(block_range, StaticFileSegment::Headers)"]
- SP --> |triggers| ST["create_static_file(block_range, StaticFileSegment::Transactions)"]
- SP --> |triggers| SR["create_static_file(block_range, StaticFileSegment::Receipts)"]
- SP --> |triggers| ETC["create_static_file(block_range, ...)"]
- SH --> CS["create_static_file::< T >(DatabaseCursor)"]
- ST --> CS
- SR --> CS
- ETC --> CS
- CS --> |create| IF(NippyJar::InclusionFilters)
- CS -- iterates --> DC(DatabaseCursor) -->HN{HasNext}
- HN --> |true| NJC(NippyJar::Compression)
- NJC --> HN
- NJC --store--> NJ
- HN --> |false| NJ
- IF --store--> NJ(NippyJar)
- NJ --freeze--> F(File)
- F--"on success"--> SP1(StaticFileProducer)
- SP1 --"sends BLOCK_HEIGHT"--> HST(HighestStaticFileTracker)
- HST --"read by"-->Pruner
- HST --"read by"-->DatabaseProvider
- HST --"read by"-->SnapsotProvider
- HST --"read by"-->ProviderFactory
-
+ RPC-->P
+ P("Provider::header(block_number)")-->PF(ProviderFactory)
+ PF--get_-->DC1{block_number <br/> > <br/> highest static file block}
+ DC1 --> |true| PD1("DatabaseProvider::header(block_number)")
+ DC1 --> |false| SFP("StaticFileProvider::header(block_number)")
+ PD1 --> MDBX
+ SFP --find block range from block number--> JP("StaticFileJarProvider::header(block_number)")
+ JP --"creates"-->SC(StaticFileCursor)
+ SC --".get_one< HeaderMask< Header > >(number)"--->NJC("NippyJarCursor")
+ NJC--".row_by_number(row_index, mask)"-->NJ[NippyJar]
+ NJ--"&[u8]"-->NJC
+ NJC--"&[u8]"-->SC
+ SC--"Header"--> JP
+ JP--"Header"--> SFP
 ```
 </details>
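+
+The dispatch in the diagram above reduces to one comparison. A minimal, hypothetical sketch (`Provider`, `highest_static_file_block` and the helper methods are illustrative stand-ins, not the exact reth API):
+
+```rust
+/// Placeholder for the real header type.
+#[derive(Debug)]
+pub struct Header;
+
+pub struct Provider {
+    /// Highest block number already frozen into static files.
+    highest_static_file_block: u64,
+}
+
+impl Provider {
+    /// Blocks above the highest static file block still live in MDBX;
+    /// everything at or below it is read from static files.
+    pub fn header(&self, block_number: u64) -> Option<Header> {
+        if block_number > self.highest_static_file_block {
+            self.database_header(block_number) // DatabaseProvider -> MDBX
+        } else {
+            self.static_file_header(block_number) // StaticFileProvider -> NippyJar
+        }
+    }
+
+    fn database_header(&self, _block_number: u64) -> Option<Header> {
+        unimplemented!("read from the MDBX Headers table")
+    }
+
+    fn static_file_header(&self, _block_number: u64) -> Option<Header> {
+        unimplemented!("find the jar covering the block range, then read the row")
+    }
+}
+```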
+
+<details>
+  <summary>By block hash</summary>
+
+```mermaid
+graph TD;
+ RPC-->P
+ P("Provider::block_by_hash(block_hash)")-->PF(ProviderFactory)
+ PF --> PD1("DatabaseProvider::block_id(block_hash)")
+ PD1 --block number--> DC1{block_number <br/> > <br/> highest static file block}
+ DC1 --> |true| PD2("DatabaseProvider::block_by_id(block_number)")
+ DC1 --> |false| SFP("StaticFileProvider::block_by_id(block_number)")
+ PD2 --> MDBX
+ SFP --find block range from block number--> JP("StaticFileJarProvider::block_by_id(block_number)")
+ JP --"creates"-->SC(StaticFileCursor)
+ SC --".get_one< HeaderMask< Header > >(number)"--->NJC("NippyJarCursor")
+ NJC--".row_by_number(row_index, mask)"-->NJ[NippyJar]
+ NJ--"&[u8]"-->NJC
+ NJC--"&[u8]"-->SC
+ SC--"Header"--> JP
+ JP--"Header"--> SFP
+```
+</details>
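+
+A hash lookup adds one database round-trip before the same dispatch, because the hash-to-number index (`HeaderNumbers`) is kept in MDBX. Continuing the hypothetical sketch above (like the diagram, it simplifies the looked-up block to its header):
+
+```rust
+impl Provider {
+    pub fn block_by_hash(&self, block_hash: [u8; 32]) -> Option<Header> {
+        // Resolve the hash to a block number via MDBX (HeaderNumbers).
+        let block_number = self.block_id(block_hash)?;
+        // Then dispatch exactly as in the "By block number" flow.
+        self.header(block_number)
+    }
+
+    fn block_id(&self, _block_hash: [u8; 32]) -> Option<u64> {
+        unimplemented!("read from the MDBX HeaderNumbers table")
+    }
+}
+```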

 <details>
-  <summary>Query diagram (Provider)</summary>
+  <summary>By transaction number</summary>

 ```mermaid
 graph TD;
 RPC-->P
- P("Provider::header(block_number)")-->PF(ProviderFactory)
- PF--shares-->SP1("Arc(StaticFileProvider)")
- SP1--shares-->PD(DatabaseProvider)
- PF--creates-->PD
- PD--check `HighestStaticFileTracker`-->PD
- PD-->DC1{block_number <br/> > <br/> highest static_file block}
- DC1 --> |true| PD1("DatabaseProvider::header(block_number)")
- DC1 --> |false| ASP("StaticFileProvider::header(block_number)")
+ P("Provider::transaction_by_id(transaction_number)")-->PF(ProviderFactory)
+ PF--get_-->DC1{transaction_number <br/> > <br/> highest static file transaction}
+ DC1 --> |true| PD1("DatabaseProvider::transaction_by_id(transaction_number)")
+ DC1 --> |false| SFP("StaticFileProvider::transaction_by_id(transaction_number)")
 PD1 --> MDBX
- ASP --find correct jar and creates--> JP("StaticFileJarProvider::header(block_number)")
+ SFP --find block range from transaction number--> JP("StaticFileJarProvider::transaction_by_id(transaction_number)")
 JP --"creates"-->SC(StaticFileCursor)
- SC --".get_one< HeaderMask< Header > >(number)"--->NJC("NippyJarCursor")
+ SC --".get_one< TransactionMask< Transaction > >(number)"--->NJC("NippyJarCursor")
 NJC--".row_by_number(row_index, mask)"-->NJ[NippyJar]
 NJ--"&[u8]"-->NJC
 NJC--"&[u8]"-->SC
- SC--"Header"--> JP
- JP--"Header"--> ASP
+ SC--"Transaction"--> JP
+ JP--"Transaction"--> SFP
 ```
 </details>
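+
+All four flows share the same cursor layer: `StaticFileCursor` maps a typed mask onto the raw bitmask that `NippyJarCursor` expects, so callers never touch masks or the decompressor directly. A hypothetical sketch of that wrapping (trait and struct bodies are illustrative, not the real reth types):
+
+```rust
+/// A typed column selection, e.g. the first of the three header columns
+/// (`Header | HeaderTD | HeaderHash` -> mask `0b001`).
+pub trait ColumnMask {
+    type Output;
+    const MASK: usize;
+    fn decode(raw: &[u8]) -> Self::Output;
+}
+
+pub struct NippyJarCursor;
+
+impl NippyJarCursor {
+    /// Returns the raw bytes of the columns selected by `mask` for one row.
+    pub fn row_by_number(&self, _row_index: usize, _mask: usize) -> Option<&[u8]> {
+        unimplemented!("seek into the jar and decompress the selected columns")
+    }
+}
+
+pub struct StaticFileCursor {
+    inner: NippyJarCursor,
+}
+
+impl StaticFileCursor {
+    /// Typed read: the mask and the decoding are derived from `M`.
+    pub fn get_one<M: ColumnMask>(&self, row_index: usize) -> Option<M::Output> {
+        let raw = self.inner.row_by_number(row_index, M::MASK)?;
+        Some(M::decode(raw))
+    }
+}
+```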
+
+<details>
+  <summary>By transaction hash</summary>
+
+```mermaid
+graph TD;
+ RPC-->P
+ P("Provider::transaction_by_hash(transaction_hash)")-->PF(ProviderFactory)
+ PF --> PD1("DatabaseProvider::transaction_id(transaction_hash)")
+ PD1 --transaction number--> DC1{transaction_number <br/> > <br/> highest static file transaction}
+ DC1 --> |true| PD2("DatabaseProvider::transaction_by_id(transaction_number)")
+ DC1 --> |false| SFP("StaticFileProvider::transaction_by_id(transaction_number)")
+ PD2 --> MDBX
+ SFP --find block range from transaction number--> JP("StaticFileJarProvider::transaction_by_id(transaction_number)")
+ JP --"creates"-->SC(StaticFileCursor)
+ SC --".get_one< TransactionMask< Transaction > >(number)"--->NJC("NippyJarCursor")
+ NJC--".row_by_number(row_index, mask)"-->NJ[NippyJar]
+ NJ--"&[u8]"-->NJC
+ NJC--"&[u8]"-->SC
+ SC--"Transaction"--> JP
+ JP--"Transaction"--> SFP
+```
+</details>
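+
+The "find block range from block number" step in the diagrams is cheap because files are aggregated by fixed 500_000-block ranges; transaction-keyed lookups first map a transaction number to its block range through the provider's index. A hypothetical sketch of the block-number case:
+
+```rust
+const BLOCKS_PER_STATIC_FILE: u64 = 500_000;
+
+/// Inclusive block range covered by the static file containing `block_number`,
+/// e.g. block 1_250_000 -> (1_000_000, 1_499_999).
+fn static_file_range(block_number: u64) -> (u64, u64) {
+    let start = (block_number / BLOCKS_PER_STATIC_FILE) * BLOCKS_PER_STATIC_FILE;
+    (start, start + BLOCKS_PER_STATIC_FILE - 1)
+}
+```
+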
 ### Glossary

 In descending order of abstraction hierarchy:

-[`StaticFileProducer`](../../crates/static_file/src/static_file_producer.rs#L20): A `reth` background service that **copies** data from the database to new static-file files when the block height reaches a certain threshold (e.g., `500_000th`). Upon completion, it dispatches a notification about the higher static file block to `HighestStaticFileTracker` channel. **It DOES NOT remove data from the database.**
-
-[`HighestStaticFileTracker`](../../crates/static_file/src/static_file_producer.rs#L22): A channel utilized by `StaticFileProducer` to announce the newest static_file block to all components with a listener: `Pruner` (to know which additional tables can be pruned) and `DatabaseProvider` (to know which data can be queried from the static files).
+[`StaticFileProducer`](../../crates/static-file/src/static_file_producer.rs#L25): A `reth` [hook](../../crates/consensus/beacon/src/engine/hooks/static_file.rs) service that, when triggered, **copies** finalized data from the database to the latest static file. Upon completion, it updates the internal index at `StaticFileProvider` with the new highest block and transaction on each specific segment.

-[`StaticFileProvider`](../../crates/storage/provider/src/providers/static_file/manager.rs#L15) A provider similar to `DatabaseProvider`, **managing all existing static_file files** and selecting the optimal one (by range and segment type) to fulfill a request. **A single instance is shared across all components and should be instantiated only once within `ProviderFactory`**. An immutable reference is given everytime `ProviderFactory` creates a new `DatabaseProvider`.
+[`StaticFileProvider`](../../crates/storage/provider/src/providers/static_file/manager.rs#L44) A provider similar to `DatabaseProvider`, **managing all existing static files** and selecting the optimal one (by range and segment type) to fulfill a request. **A single instance is shared across all components and should be instantiated only once within `ProviderFactory`**. An immutable reference is given every time `ProviderFactory` creates a new `DatabaseProvider`.

-[`StaticFileJarProvider`](../../crates/storage/provider/src/providers/static_file/jar.rs#L42) A provider similar to `DatabaseProvider` that provides access to a **single static_file file**.
+[`StaticFileJarProvider`](../../crates/storage/provider/src/providers/static_file/jar.rs#L42) A provider similar to `DatabaseProvider` that provides access to a **single static file segment's data** on a specific block range.

-[`StaticFileCursor`](../../crates/storage/db/src/static_file/cursor.rs#L12) An elevated abstraction of `NippyJarCursor` for simplified access. It associates the bitmasks with type decoding. For instance, `cursor.get_two::<TransactionMask<Tx, Signature>>(tx_number)` would yield `Tx` and `Signature`, eliminating the need to manage masks or invoke a decoder/decompressor.
+[`StaticFileCursor`](../../crates/storage/db/src/static_file/cursor.rs#L11) An elevated abstraction of `NippyJarCursor` for simplified access. It associates the bitmasks with type decoding. For instance, `cursor.get_two::<TransactionMask<Tx, Signature>>(tx_number)` would yield `Tx` and `Signature`, eliminating the need to manage masks or invoke a decoder/decompressor.

-[`StaticFileSegment`](../../crates/primitives/src/static_file/segment.rs#L10) Each static_file file only contains data of a specific segment, e.g., `Headers`, `Transactions`, or `Receipts`.
+[`StaticFileSegment`](../../crates/primitives/src/static_file/segment.rs#L10) Each static file only contains data of a specific segment, e.g., `Headers`, `Transactions`, or `Receipts`.

-[`NippyJarCursor`](../../crates/storage/nippy-jar/src/cursor.rs#L12) Accessor of data in a `NippyJar` file. It enables queries either by row number (e.g., block number 1) or by a predefined key not part of the file (e.g., transaction hashes). If a file has multiple columns (e.g., `Tx | TxSender | Signature`), and one wishes to access only one of the column values, this can be accomplished by bitmasks. (e.g., for `TxSender`, the mask would be `0b010`).
+[`NippyJarCursor`](../../crates/storage/nippy-jar/src/cursor.rs#L12) Accessor of data in a `NippyJar` file. It enables queries either by row number (e.g., block number 1) or by a predefined key not part of the file (e.g., transaction hashes). **Currently, only queries by row number are used.** If a file has multiple columns (e.g., `Header | HeaderTD | HeaderHash`), and one wishes to access only one of the column values, this can be accomplished with bitmasks (e.g., for `HeaderTD`, the mask would be `0b010`).

-[`NippyJar`](../../crates/storage/nippy-jar/src/lib.rs#57) A create-only file format. No data can be appended after creation. It supports multiple columns, compression (e.g., Zstd (with and without dictionaries), lz4, uncompressed) and inclusion filters (e.g., cuckoo filter: `is hash X part of this dataset`). StaticFiles are organized by block ranges. (e.g., `TransactionStaticFile_499_999.jar` contains a transaction per row for all transactions from block `0` to block `499_999`). For more check the struct documentation.
+[`NippyJar`](../../crates/storage/nippy-jar/src/lib.rs#92) An append-or-truncate-only file format. It supports multiple columns, compression (e.g., Zstd (with and without dictionaries), lz4, uncompressed) and inclusion filters (e.g., cuckoo filter: `is hash X part of this dataset`). Static files are organized by block ranges (e.g., `TransactionStaticFile_0_-_499_999.jar` contains a transaction per row for all transactions between block `0` and block `499_999`). For more, check the struct documentation.
diff --git a/crates/storage/provider/src/providers/database/mod.rs b/crates/storage/provider/src/providers/database/mod.rs
index 6752a7d8a4be..7393043c4e31 100644
--- a/crates/storage/provider/src/providers/database/mod.rs
+++ b/crates/storage/provider/src/providers/database/mod.rs
@@ -34,7 +34,7 @@ mod provider;
 pub use provider::{DatabaseProvider, DatabaseProviderRO, DatabaseProviderRW};
 use reth_db::mdbx::DatabaseArguments;

-/// A common provider that fetches data from a database.
+/// A common provider that fetches data from a database or static files.
 ///
 /// This provider implements most provider or provider factory traits.
 #[derive(Debug, Clone)]