From e420faf5012b7ff85f3e3e57864951bb168d279e Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 1 Dec 2023 14:39:30 +0100 Subject: [PATCH 1/3] impl guide: update PVF host page; add diagrams Would have been nice to have some helpful diagrams a while ago, but I just thought of it. :P --- polkadot/roadmap/implementers-guide/README.md | 2 +- .../src/node/utility/candidate-validation.md | 25 +++++ .../src/node/utility/pvf-host-and-workers.md | 91 +++++++++++++++++-- 3 files changed, 108 insertions(+), 10 deletions(-) diff --git a/polkadot/roadmap/implementers-guide/README.md b/polkadot/roadmap/implementers-guide/README.md index e03c0c45ddba..abff017138c0 100644 --- a/polkadot/roadmap/implementers-guide/README.md +++ b/polkadot/roadmap/implementers-guide/README.md @@ -8,7 +8,7 @@ This is available [here](https://paritytech.github.io/polkadot-sdk/book/). ## Local build -To view it locally from the repo root: +To view it locally, run the following (from the `polkadot/` directory): Ensure graphviz is installed: diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md b/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md index e252ec237b79..1a3ff1c6aff0 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/candidate-validation.md @@ -5,6 +5,31 @@ This subsystem is responsible for handling candidate validation requests. It is A variety of subsystems want to know if a parachain block candidate is valid. None of them care about the detailed mechanics of how a candidate gets validated, just the results. This subsystem handles those details. 
+## High-Level Flow + +```dot process +digraph { + rankdir="LR"; + + pre [label = "PVF\nPre-checker"; shape = square] + bac [label = "Backing"; shape = square] + app [label = "Approval\nVoting"; shape = square] + dis [label = "Dispute\nCoordinator"; shape = square] + + can [label = "Candidate\nValidation"; shape = square] + + pvf [label = "PVF Host"; shape = square] + + pre -> can [style = dashed] + bac -> can + app -> can + dis -> can + + can -> pvf [label = "Precheck"; style = dashed] + can -> pvf [label = "Validate"] +} +``` + ## Protocol Input: [`CandidateValidationMessage`](../../types/overseer-protocol.md#validation-request-type) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index 56bdd48bc0c3..a8d839202373 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -2,12 +2,80 @@ The PVF host is responsible for handling requests to prepare and execute PVF code blobs, which it sends to PVF **workers** running in their own child -processes. +processes. These workers are spawned from the `polkadot-prepare-worker` and +`polkadot-execute-worker` binaries. While the workers are generally long-living, they also spawn one-off secure **job processes** that perform the jobs. See "Job Processes" section below. 
-This system has two high-levels goals that we will touch on here: *determinism* +## High-Level Flow + +```dot process +digraph { + rankdir="LR"; + + can [label = "Candidate\nValidation\nSubsystem"; shape = square] + + pvf [label = "PVF Host"; shape = square] + + pp [label = "Prepare\nPool"; shape = square] + eq [label = "Execute\nQueue"; shape = square] + + subgraph "cluster partial_sandbox_1" { + label = "polkadot-prepare-worker\n(Partial Sandbox)\n\n\n"; + labelloc = "t"; + + pw [label = "Prepare\nWorker"; shape = square] + + subgraph "cluster full_sandbox_1" { + label = "Fully Isolated Sandbox\n\n\n"; + labelloc = "t"; + + pj [label = "Prepare\nJob"; shape = square] + } + } + + subgraph "cluster partial_sandbox_2" { + label = "polkadot-execute-worker\n(Partial Sandbox)\n\n\n"; + labelloc = "t"; + + ew [label = "Execute\nWorker"; shape = square] + + subgraph "cluster full_sandbox_2" { + label = "Fully Isolated Sandbox\n\n\n"; + labelloc = "t"; + + ej [label = "Execute\nJob"; shape = square] + } + } + + can -> pvf [label = "Precheck"; style = dashed] + can -> pvf [label = "Validate"] + + pvf -> pp [label = "Prepare"; style = dashed] + pvf -> eq [label = "Execute";] + pvf -> pvf [label = "see (2) and (3)"; style = dashed] + + pp -> pw [style = dashed] + eq -> ew + + pw -> pj [style = dashed] + ew -> ej +} +``` + +Some notes about the graph: + +1. Once a job has finished, the response will flow back up the way it came. +2. In the case of execution, the host will send a request for preparation to the + Prepare Pool if needed. In that case, only after the preparation succeeds + does the Execute Queue continue with validation. +3. Multiple requests for preparing the same artifact are coalesced, so that the + work is only done once. + +## Goals + +This system has two high-level goals that we will touch on here: *determinism* and *security*. ## Determinism @@ -142,19 +210,24 @@ So what are we actually worried about? Things that come to mind: 6. 
**Intercepting and manipulating packages** - Effect very similar to the above, hard to do without also being able to do 4 or 5. +We do not protect against (1), (2), and (3), because there are too many sources +of randomness for an attacker to exploit. + +We provide very good protection against (4), (5), and (6). + ### Job Processes As mentioned above, our architecture includes long-living **worker processes** -and one-off **job processes*. This separation is important so that the handling +and one-off **job processes**. This separation is important so that the handling of untrusted code can be limited to the job processes. A hijacked job process can therefore not interfere with other jobs running in separate processes. -Furthermore, if an unexpected execution error occurred in the worker and not the -job, we generally can be confident that it has nothing to do with the candidate, -so we can abstain from voting. On the other hand, a hijacked job can send back -erroneous responses for candidates, so we know that we should not abstain from -voting on such errors from jobs. Otherwise, an attacker could trigger a finality -stall. (See "Internal Errors" section above.) +Furthermore, if an unexpected execution error occurred in the execution worker +and not the job itself, we generally can be confident that it has nothing to do +with the candidate, so we can abstain from voting. On the other hand, a hijacked +job is able to send back erroneous responses for candidates, so we know that we +should not abstain from voting on such errors from jobs. Otherwise, an attacker +could trigger a finality stall. (See "Internal Errors" section above.) ### Restricting file-system access From 7167f2a1688f42a6b627001ee005dbee4f698610 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Sun, 3 Dec 2023 11:59:05 +0100 Subject: [PATCH 2/3] Fix bad link --- .../implementers-guide/src/node/utility/pvf-prechecker.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md index f0de50f2267b..7f6fef7ddf63 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md @@ -8,9 +8,9 @@ pre-checking. Head over to [overview] for the PVF pre-checking process overview. There is no dedicated input mechanism for PVF pre-checker. Instead, PVF pre-checker looks on the `ActiveLeavesUpdate` event stream for work. -This subsytem does not produce any output messages either. The subsystem will, however, send messages to the [Runtime -API] subsystem to query for the pending PVFs and to submit votes. In addition to that, it will also communicate with -[Candidate Validation] Subsystem to request PVF pre-check. +This subsystem does not produce any output messages either. The subsystem will, however, send messages to the +[Runtime API] subsystem to query for the pending PVFs and to submit votes. In addition to that, it will also +communicate with [Candidate Validation] Subsystem to request PVF pre-check. ## Functionality From 9f72c21f7ee49b4306a75a97e085ee173021d8a1 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Sun, 3 Dec 2023 15:36:47 +0100 Subject: [PATCH 3/3] Add prepare queue --- .../src/node/utility/pvf-host-and-workers.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index a8d839202373..e0984bd58d1d 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -18,16 +18,17 @@ digraph { pvf [label = "PVF Host"; shape = square] - pp [label = "Prepare\nPool"; shape = square] + pq [label = "Prepare\nQueue"; shape = square] eq [label = "Execute\nQueue"; shape = square] + pp [label = "Prepare\nPool"; shape = square] - subgraph "cluster partial_sandbox_1" { + subgraph "cluster partial_sandbox_prep" { label = "polkadot-prepare-worker\n(Partial Sandbox)\n\n\n"; labelloc = "t"; pw [label = "Prepare\nWorker"; shape = square] - subgraph "cluster full_sandbox_1" { + subgraph "cluster full_sandbox_prep" { label = "Fully Isolated Sandbox\n\n\n"; labelloc = "t"; @@ -35,13 +36,13 @@ digraph { } } - subgraph "cluster partial_sandbox_2" { + subgraph "cluster partial_sandbox_exec" { label = "polkadot-execute-worker\n(Partial Sandbox)\n\n\n"; labelloc = "t"; ew [label = "Execute\nWorker"; shape = square] - subgraph "cluster full_sandbox_2" { + subgraph "cluster full_sandbox_exec" { label = "Fully Isolated Sandbox\n\n\n"; labelloc = "t"; @@ -52,9 +53,10 @@ digraph { can -> pvf [label = "Precheck"; style = dashed] can -> pvf [label = "Validate"] - pvf -> pp [label = "Prepare"; style = dashed] + pvf -> pq [label = "Prepare"; style = dashed] pvf -> eq [label = "Execute";] pvf -> pvf [label = "see (2) and (3)"; style = dashed] + pq -> pp [style = dashed] pp -> pw [style = dashed] eq -> ew @@ -68,7 +70,7 @@ Some notes about the graph: 1. 
Once a job has finished, the response will flow back up the way it came. 2. In the case of execution, the host will send a request for preparation to the - Prepare Pool if needed. In that case, only after the preparation succeeds + Prepare Queue if needed. In that case, only after the preparation succeeds does the Execute Queue continue with validation. 3. Multiple requests for preparing the same artifact are coalesced, so that the work is only done once.