diff --git a/core/store/src/db.rs b/core/store/src/db.rs index cbd4d81e152..6eb32d4fc9e 100644 --- a/core/store/src/db.rs +++ b/core/store/src/db.rs @@ -16,6 +16,7 @@ use near_primitives::version::DbVersion; use crate::db::refcount::merge_refcounted_records; +use rocksdb::checkpoint::Checkpoint; use std::path::Path; use std::sync::atomic::Ordering; @@ -872,6 +873,11 @@ impl RocksDB { Ok(()) } } + + /// Creates a Checkpoint object that can be used to actually create a checkpoint on disk. + pub fn checkpoint(&self) -> Result { + Checkpoint::new(&self.db).map_err(|err| DBError(err)) + } } fn available_space + std::fmt::Debug>( diff --git a/nearcore/src/config.rs b/nearcore/src/config.rs index 9c2909f5190..73631643c93 100644 --- a/nearcore/src/config.rs +++ b/nearcore/src/config.rs @@ -322,6 +322,10 @@ fn default_trie_viewer_state_size_limit() -> Option { Some(50_000) } +fn default_use_checkpoints_for_db_migration() -> bool { + true +} + #[derive(Serialize, Deserialize, Clone, Debug)] pub struct Consensus { /// Minimum number of peers to start syncing. @@ -435,6 +439,15 @@ pub struct Config { /// If set, overrides value in genesis configuration. #[serde(skip_serializing_if = "Option::is_none")] pub max_gas_burnt_view: Option, + /// If true, a DB checkpoint is created before a DB migration; it lets the user recover from interrupted DB migrations. + #[serde(default = "default_use_checkpoints_for_db_migration")] + pub use_db_migration_snapshot: bool, + /// Location of the DB checkpoint for the DB migrations. This can be one of the following: + /// * Empty, the checkpoint will be created in the database location, i.e. '$home/data'. + /// * Absolute path that points to an existing directory. The checkpoint will be a sub-directory in that directory. + /// For example, setting "db_migration_snapshot_path" to "/tmp/" will create a directory "/tmp/db_migration_snapshot" and populate it with the database files. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub db_migration_snapshot_path: Option, } impl Default for Config { @@ -461,6 +474,8 @@ impl Default for Config { view_client_throttle_period: default_view_client_throttle_period(), trie_viewer_state_size_limit: default_trie_viewer_state_size_limit(), max_gas_burnt_view: None, + db_migration_snapshot_path: None, + use_db_migration_snapshot: true, } } } diff --git a/nearcore/src/lib.rs b/nearcore/src/lib.rs index 3d30a155660..b175258d77e 100644 --- a/nearcore/src/lib.rs +++ b/nearcore/src/lib.rs @@ -1,26 +1,27 @@ -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::Arc; - +pub use crate::config::{init_configs, load_config, load_test_config, NearConfig, NEAR_BASE}; +use crate::migrations::{ + migrate_12_to_13, migrate_18_to_19, migrate_19_to_20, migrate_22_to_23, migrate_23_to_24, + migrate_24_to_25, migrate_30_to_31, +}; +pub use crate::runtime::NightshadeRuntime; +pub use crate::shard_tracker::TrackedConfig; use actix::{Actor, Addr, Arbiter}; use actix_rt::ArbiterHandle; use actix_web; use anyhow::Context; -#[cfg(feature = "performance_stats")] -use near_rust_allocator_proxy::reset_memory_usage_max; -use tracing::{error, info, trace}; - use near_chain::ChainGenesis; #[cfg(feature = "test_features")] use near_client::AdversarialControls; use near_client::{start_client, start_view_client, ClientActor, ViewClientActor}; - use near_network::routing::start_routing_table_actor; use near_network::test_utils::NetworkRecipient; use near_network::PeerManagerActor; use near_primitives::network::PeerId; #[cfg(feature = "rosetta_rpc")] use near_rosetta_rpc::start_rosetta_rpc; +#[cfg(feature = "performance_stats")] +use near_rust_allocator_proxy::reset_memory_usage_max; +use near_store::db::RocksDB; use near_store::migrations::{ fill_col_outcomes_by_hash, fill_col_transaction_refcount, get_store_version, migrate_10_to_11, migrate_11_to_12, migrate_13_to_14, migrate_14_to_15, migrate_17_to_18, migrate_20_to_21, @@ 
-29,14 +30,10 @@ use near_store::migrations::{ }; use near_store::{create_store, Store}; use near_telemetry::TelemetryActor; - -pub use crate::config::{init_configs, load_config, load_test_config, NearConfig, NEAR_BASE}; -use crate::migrations::{ - migrate_12_to_13, migrate_18_to_19, migrate_19_to_20, migrate_22_to_23, migrate_23_to_24, - migrate_24_to_25, migrate_30_to_31, -}; -pub use crate::runtime::NightshadeRuntime; -pub use crate::shard_tracker::TrackedConfig; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use tracing::{error, info, trace}; pub mod append_only_map; pub mod config; @@ -74,6 +71,46 @@ pub fn get_default_home() -> PathBuf { PathBuf::default() } +/// Returns the path of the DB checkpoint: the `db_migration_snapshot` sub-directory of the config's `db_migration_snapshot_path` if that is set (panics if it is not an absolute path). +/// Default location is the same as the database location: `path`. +fn db_checkpoint_path(path: &Path, near_config: &NearConfig) -> PathBuf { + let root_path = + if let Some(db_migration_snapshot_path) = &near_config.config.db_migration_snapshot_path { + assert!( + db_migration_snapshot_path.is_absolute(), + "'db_migration_snapshot_path' must be an absolute path to an existing directory." + ); + db_migration_snapshot_path.clone() + } else { + path.to_path_buf() + }; + root_path.join(DB_CHECKPOINT_NAME) +} + +const DB_CHECKPOINT_NAME: &str = "db_migration_snapshot"; + +/// Creates a consistent DB checkpoint and returns its path. Returns an error if the checkpoint path already exists. +/// By default it creates checkpoints in the DB directory, but can be overridden by the config. 
+fn create_db_checkpoint(path: &Path, near_config: &NearConfig) -> Result { + let checkpoint_path = db_checkpoint_path(path, near_config); + if checkpoint_path.exists() { + return Err(anyhow::anyhow!( + "Detected an existing database migration snapshot: '{}'.\n\ + Probably a database migration got interrupted and your database is corrupted.\n\ + Please replace the contents of '{}' with data from that checkpoint, delete the checkpoint and try again.", + checkpoint_path.display(), + path.display())); + } + + let db = RocksDB::new(path)?; + let checkpoint = db.checkpoint()?; + info!(target: "near", "Creating a database migration snapshot in '{}'", checkpoint_path.display()); + checkpoint.create_checkpoint(&checkpoint_path)?; + info!(target: "near", "Created a database migration snapshot in '{}'", checkpoint_path.display()); + + Ok(checkpoint_path) +} + /// Function checks current version of the database and applies migrations to the database. pub fn apply_store_migrations(path: &Path, near_config: &NearConfig) { let db_version = get_store_version(path); @@ -81,10 +118,37 @@ pub fn apply_store_migrations(path: &Path, near_config: &NearConfig) { error!(target: "near", "DB version {} is created by a newer version of neard, please update neard or delete data", db_version); std::process::exit(1); } + if db_version == near_primitives::version::DB_VERSION { return; } + // Before starting a DB migration, create a consistent snapshot of the database. If a migration + // fails, it can be used to quickly restore the database to its original state. + let checkpoint_path = if near_config.config.use_db_migration_snapshot { + match create_db_checkpoint(path, near_config) { + Ok(checkpoint_path) => { + info!(target: "near", "Created a DB checkpoint before a DB migration: '{}'. 
Please recover from this checkpoint if the migration gets interrupted.", checkpoint_path.display()); + Some(checkpoint_path) + } + Err(err) => { + panic!( + "Failed to create a database migration snapshot:\n\ + {}\n\ + Please consider fixing this issue and retrying.\n\ + You can change the location of database migration snapshots by adjusting `config.json`:\n\ + \t\"db_migration_snapshot_path\": \"/absolute/path/to/existing/dir\",\n\ + Alternatively, you can disable database migration snapshots in `config.json`:\n\ + \t\"use_db_migration_snapshot\": false,\n\ + ", + err + ); + } + } + } else { + None + }; + // Add migrations here based on `db_version`. if db_version <= 1 { // version 1 => 2: add gc column @@ -274,6 +338,25 @@ pub fn apply_store_migrations(path: &Path, near_config: &NearConfig) { let db_version = get_store_version(path); debug_assert_eq!(db_version, near_primitives::version::DB_VERSION); } + + // DB migration was successful, remove the checkpoint to avoid it taking up precious disk space. A failure to delete it is only logged. + if let Some(checkpoint_path) = checkpoint_path { + info!(target: "near", "Deleting the database migration snapshot at '{}'", checkpoint_path.display()); + match std::fs::remove_dir_all(&checkpoint_path) { + Ok(_) => { + info!(target: "near", "Deleted the database migration snapshot at '{}'", checkpoint_path.display()); + } + Err(err) => { + error!(target: "near", + "Failed to delete the database migration snapshot at '{}'.\n\ + \tError: {:#?}.\n\ + \n\ + Please delete the database migration snapshot manually before the next start of the node.", + checkpoint_path.display(), + err); + } + } + } } pub fn init_and_migrate_store(home_dir: &Path, near_config: &NearConfig) -> Store {