Skip to content

Commit

Permalink
feat: Archiving of prover in gpu_prover_queue (#1537)
Browse files Browse the repository at this point in the history
## What ❔

Add an archiver for provers in `gpu_prover_queue` that moves every prover
whose status has been dead for a configured period of time to the archive.
Add an availability checker for provers that checks whether a prover has
been marked dead while it is still alive, and shuts it down if so.

## Why ❔

To improve prover performance and to prevent incidents in which provers are
marked dead while still alive (autoscalers won't scale provers up further,
because they see that the prover is alive).

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `zk fmt` and `zk lint`.
- [x] Spellcheck has been run via `zk spellcheck`.
- [x] Linkcheck has been run via `zk linkcheck`.
  • Loading branch information
Artemka374 committed Apr 4, 2024
1 parent 6e9ed8c commit a970629
Show file tree
Hide file tree
Showing 29 changed files with 499 additions and 240 deletions.
1 change: 1 addition & 0 deletions checks-config/era.dic
Original file line number Diff line number Diff line change
Expand Up @@ -927,3 +927,4 @@ StorageMarker
SIGINT
opentelemetry
PubdataSendingMode
FriGpuProverArchiver
18 changes: 16 additions & 2 deletions core/lib/basic_types/src/prover_dal.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! Types exposed by the prover DAL for general-purpose use.
use std::{net::IpAddr, ops::Add};
use std::{net::IpAddr, ops::Add, str::FromStr};

use chrono::{DateTime, Duration, Utc};

Expand Down Expand Up @@ -204,7 +204,7 @@ pub struct JobExtendedStatistics {
pub active_area: Vec<ProverJobInfo>,
}

#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GpuProverInstanceStatus {
// The instance is available for processing.
Available,
Expand All @@ -215,3 +215,17 @@ pub enum GpuProverInstanceStatus {
// The instance is not alive anymore.
Dead,
}

/// Parses the textual form of a GPU prover instance status.
///
/// The recognized spellings are the exact lowercase strings `"available"`,
/// `"full"`, `"reserved"` and `"dead"`. Matching is case-sensitive and no
/// trimming is performed.
impl FromStr for GpuProverInstanceStatus {
    /// Parsing carries no diagnostic payload: any unrecognized input yields `Err(())`.
    type Err = ();

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let status = match s {
            "available" => Self::Available,
            "full" => Self::Full,
            "reserved" => Self::Reserved,
            "dead" => Self::Dead,
            // Anything else (including different casing or the empty string) is rejected.
            _ => return Err(()),
        };
        Ok(status)
    }
}
2 changes: 2 additions & 0 deletions core/lib/config/src/configs/fri_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ pub struct FriProverConfig {
pub queue_capacity: usize,
pub witness_vector_receiver_port: u16,
pub zone_read_url: String,
pub availability_check_interval_in_secs: u32,

// whether to write to public GCS bucket for https://github.com/matter-labs/era-boojum-validator-cli
pub shall_save_to_public_bucket: bool,
pub object_store: Option<ObjectStoreConfig>,
Expand Down
17 changes: 12 additions & 5 deletions core/lib/config/src/configs/house_keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,20 @@ pub struct HouseKeeperConfig {
pub prover_db_pool_size: u32,
pub proof_compressor_job_retrying_interval_ms: u64,
pub proof_compressor_stats_reporting_interval_ms: u64,
pub prover_job_archiver_reporting_interval_ms: Option<u64>,
pub prover_job_archiver_archiving_interval_secs: Option<u64>,
pub prover_job_archiver_archiving_interval_ms: Option<u64>,
pub prover_job_archiver_archive_after_secs: Option<u64>,
pub fri_gpu_prover_archiver_archiving_interval_ms: Option<u64>,
pub fri_gpu_prover_archiver_archive_after_secs: Option<u64>,
}

impl HouseKeeperConfig {
pub fn prover_job_archiver_enabled(&self) -> bool {
self.prover_job_archiver_reporting_interval_ms.is_some()
&& self.prover_job_archiver_archiving_interval_secs.is_some()
/// Returns the prover job archiver settings as the pair
/// `(prover_job_archiver_archiving_interval_ms, prover_job_archiver_archive_after_secs)`.
///
/// Yields `None` unless both values are set.
pub fn prover_job_archiver_params(&self) -> Option<(u64, u64)> {
    let archiving_interval_ms = self.prover_job_archiver_archiving_interval_ms?;
    let archive_after_secs = self.prover_job_archiver_archive_after_secs?;
    Some((archiving_interval_ms, archive_after_secs))
}

/// Returns the FRI GPU prover archiver settings as the pair
/// `(fri_gpu_prover_archiver_archiving_interval_ms, fri_gpu_prover_archiver_archive_after_secs)`.
///
/// Yields `None` unless both values are set.
pub fn fri_gpu_prover_archiver_params(&self) -> Option<(u64, u64)> {
    let archiving_interval_ms = self.fri_gpu_prover_archiver_archiving_interval_ms?;
    let archive_after_secs = self.fri_gpu_prover_archiver_archive_after_secs?;
    Some((archiving_interval_ms, archive_after_secs))
}
}
7 changes: 5 additions & 2 deletions core/lib/config/src/testonly.rs
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ impl Distribution<configs::FriProverConfig> for EncodeDist {
witness_vector_receiver_port: self.sample(rng),
zone_read_url: self.sample(rng),
shall_save_to_public_bucket: self.sample(rng),
availability_check_interval_in_secs: self.sample(rng),
object_store: self.sample(rng),
}
}
Expand Down Expand Up @@ -563,8 +564,10 @@ impl Distribution<configs::house_keeper::HouseKeeperConfig> for EncodeDist {
witness_generator_job_retrying_interval_ms: self.sample(rng),
proof_compressor_job_retrying_interval_ms: self.sample(rng),
proof_compressor_stats_reporting_interval_ms: self.sample(rng),
prover_job_archiver_reporting_interval_ms: self.sample(rng),
prover_job_archiver_archiving_interval_secs: self.sample(rng),
prover_job_archiver_archiving_interval_ms: self.sample(rng),
prover_job_archiver_archive_after_secs: self.sample(rng),
fri_gpu_prover_archiver_archiving_interval_ms: self.sample(rng),
fri_gpu_prover_archiver_archive_after_secs: self.sample(rng),
}
}
}
Expand Down

This file was deleted.

2 changes: 2 additions & 0 deletions core/lib/env_config/src/fri_prover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ mod tests {
},
max_retries: 5,
}),
availability_check_interval_in_secs: 1_800,
}
}

Expand All @@ -65,6 +66,7 @@ mod tests {
FRI_PROVER_WITNESS_VECTOR_RECEIVER_PORT="3316"
FRI_PROVER_ZONE_READ_URL="http://metadata.google.internal/computeMetadata/v1/instance/zone"
FRI_PROVER_SHALL_SAVE_TO_PUBLIC_BUCKET=true
FRI_PROVER_AVAILABILITY_CHECK_INTERVAL_IN_SECS="1800"
OBJECT_STORE_BUCKET_BASE_URL="/base/url"
OBJECT_STORE_MODE="GCSWithCredentialFile"
OBJECT_STORE_GCS_CREDENTIAL_FILE_PATH="/path/to/credentials.json"
Expand Down
14 changes: 10 additions & 4 deletions core/lib/env_config/src/house_keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@ mod tests {
prover_db_pool_size: 2,
proof_compressor_job_retrying_interval_ms: 30_000,
proof_compressor_stats_reporting_interval_ms: 10_000,
prover_job_archiver_reporting_interval_ms: Some(1_800_000),
prover_job_archiver_archiving_interval_secs: Some(172_800),
prover_job_archiver_archiving_interval_ms: Some(1_800_000),
prover_job_archiver_archive_after_secs: Some(172_800),
// 24 hours
fri_gpu_prover_archiver_archiving_interval_ms: Some(86_400_000),
// 48 hours
fri_gpu_prover_archiver_archive_after_secs: Some(172_800),
}
}

Expand All @@ -48,8 +52,10 @@ mod tests {
HOUSE_KEEPER_PROVER_STATS_REPORTING_INTERVAL_MS="5000"
HOUSE_KEEPER_PROOF_COMPRESSOR_STATS_REPORTING_INTERVAL_MS="10000"
HOUSE_KEEPER_PROOF_COMPRESSOR_JOB_RETRYING_INTERVAL_MS="30000"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_REPORTING_INTERVAL_MS="1800000"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_ARCHIVING_INTERVAL_SECS="172800"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_ARCHIVING_INTERVAL_MS="1800000"
HOUSE_KEEPER_PROVER_JOB_ARCHIVER_ARCHIVE_AFTER_SECS="172800"
HOUSE_KEEPER_FRI_GPU_PROVER_ARCHIVER_ARCHIVING_INTERVAL_MS="86400000"
HOUSE_KEEPER_FRI_GPU_PROVER_ARCHIVER_ARCHIVE_AFTER_SECS="172800"
"#;
lock.set_env(config);

Expand Down
22 changes: 14 additions & 8 deletions core/lib/protobuf_config/src/house_keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,13 @@ impl ProtoRepr for proto::HouseKeeper {
.context("proof_compressor_stats_reporting_interval_ms")?,

// TODO(PLA-862): Make these 2 variables required
prover_job_archiver_reporting_interval_ms: self
.prover_job_archiver_reporting_interval_ms,
prover_job_archiver_archiving_interval_secs: self
.prover_job_archiver_archiving_interval_secs,
prover_job_archiver_archiving_interval_ms: self
.prover_job_archiver_archiving_interval_ms,
prover_job_archiver_archive_after_secs: self.prover_job_archiver_archive_after_secs,
fri_gpu_prover_archiver_archiving_interval_ms: self
.fri_gpu_prover_archiver_archiving_interval_ms,
fri_gpu_prover_archiver_archive_after_secs: self
.fri_gpu_prover_archiver_archive_after_secs,
})
}

Expand Down Expand Up @@ -73,10 +76,13 @@ impl ProtoRepr for proto::HouseKeeper {
proof_compressor_stats_reporting_interval_ms: Some(
this.proof_compressor_stats_reporting_interval_ms,
),
prover_job_archiver_reporting_interval_ms: this
.prover_job_archiver_reporting_interval_ms,
prover_job_archiver_archiving_interval_secs: this
.prover_job_archiver_archiving_interval_secs,
prover_job_archiver_archiving_interval_ms: this
.prover_job_archiver_archiving_interval_ms,
prover_job_archiver_archive_after_secs: this.prover_job_archiver_archive_after_secs,
fri_gpu_prover_archiver_archiving_interval_ms: this
.fri_gpu_prover_archiver_archiving_interval_ms,
fri_gpu_prover_archiver_archive_after_secs: this
.fri_gpu_prover_archiver_archive_after_secs,
}
}
}
26 changes: 14 additions & 12 deletions core/lib/protobuf_config/src/proto/house_keeper.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@ syntax = "proto3";
package zksync.config.house_keeper;

message HouseKeeper {
optional uint64 l1_batch_metrics_reporting_interval_ms = 1; // required; ms
optional uint64 gpu_prover_queue_reporting_interval_ms = 2; // required; ms
optional uint64 prover_job_retrying_interval_ms = 3; // required; ms
optional uint64 prover_stats_reporting_interval_ms = 4; // required ms
optional uint64 witness_job_moving_interval_ms = 5; // required; ms
optional uint64 witness_generator_stats_reporting_interval_ms = 6; // required; ms
optional uint64 witness_generator_job_retrying_interval_ms = 9; // required; ms
optional uint32 prover_db_pool_size = 10; // required
optional uint64 proof_compressor_job_retrying_interval_ms = 12; // required; ms
optional uint64 proof_compressor_stats_reporting_interval_ms = 13; // required; ms
optional uint64 prover_job_archiver_reporting_interval_ms = 14; // optional; ms
optional uint64 prover_job_archiver_archiving_interval_secs = 15; // optional; seconds
optional uint64 l1_batch_metrics_reporting_interval_ms = 1; // required; ms
optional uint64 gpu_prover_queue_reporting_interval_ms = 2; // required; ms
optional uint64 prover_job_retrying_interval_ms = 3; // required; ms
optional uint64 prover_stats_reporting_interval_ms = 4; // required ms
optional uint64 witness_job_moving_interval_ms = 5; // required; ms
optional uint64 witness_generator_stats_reporting_interval_ms = 6; // required; ms
optional uint64 witness_generator_job_retrying_interval_ms = 9; // required; ms
optional uint32 prover_db_pool_size = 10; // required
optional uint64 proof_compressor_job_retrying_interval_ms = 12; // required; ms
optional uint64 proof_compressor_stats_reporting_interval_ms = 13; // required; ms
optional uint64 prover_job_archiver_archiving_interval_ms = 14; // optional; ms
optional uint64 prover_job_archiver_archive_after_secs = 15; // optional; seconds
optional uint64 fri_gpu_prover_archiver_archiving_interval_ms = 16; // optional; ms
optional uint64 fri_gpu_prover_archiver_archive_after_secs = 17; // optional; seconds
}
Loading

0 comments on commit a970629

Please sign in to comment.