diff --git a/deploy/README.adoc b/deploy/README.adoc index eaf17ca782a..b88fc4d03cc 100644 --- a/deploy/README.adoc +++ b/deploy/README.adoc @@ -83,42 +83,58 @@ all the dependencies for Omicron installed. Following the *prerequisites* in the https://github.com/oxidecomputer/omicron/#build-and-run[Build and run] section of the main Omicron README is probably a good idea. -=== Command Based Workflow +==== Update `config-rss.toml` -==== Build thing-flinger on client -`thing-flinger` is part of the `omicron-package` crate. +Currently rack setup is driven by a configuration file that lives at +`smf/sled-agent/config-rss.toml` in the root of this repository. The committed +configuration of that file contains a single `[[requests]]` entry (with many +services inside it), which means it will start services on only one sled. To +start services (e.g., nexus) on multiple sleds, add additional entries to that +configuration file before proceeding. -`cargo build -p omicron-package` +=== Command Based Workflow ==== sync -Copy your source code to the builder. Note that this copies over your `.git` subdirectory on purpose so -that a branch can be configured for building with the `git_treeish` field in the toml `builder` -table. +Copy your source code to the builder. + +`cargo run --bin thing-flinger -- -c sync` -`./target/debug/thing-flinger -c sync` +==== Install Prerequisites +Install necessary build and runtime dependencies (including downloading prebuilt +binaries like Clickhouse and CockroachDB) on the builder and all deployment +targets. This step only needs to be performed once, absent any changes to the +dependencies, but is idempotent so may be run multiple times. -==== build-minimal -Build necessary parts of omicron on the builder, required for future use by thing-flinger. +`cargo run --bin thing-flinger -- -c install-prereqs` -`./target/debug/thing-flinger -c build-minimal` +==== check (optional) +Run `cargo check` on the builder against the copy of `omicron` that was sync'd +to it in the previous step. -==== package +`cargo run --bin thing-flinger -- -c build check` + +==== package Build and package omicron using `omicron-package` on the builder. -`./target/debug/thing-flinger -c package` +`cargo run --bin thing-flinger -- -c build package` ==== overlay Create files that are unique to each deployment server. -`./target/debug/thing-flinger -c overlay` +`cargo run --bin thing-flinger -- -c overlay` -==== install +==== install Install omicron to all machines, in parallel. This consists of copying the packaged omicron tarballs along with overlay files, and omicron-package and its manifest to a `staging` directory on each deployment server, and then running omicron-package, installing overlay files, and restarting services. -`./target/debug/thing-flinger -c install` +`cargo run --bin thing-flinger -- -c deploy install` + +==== uninstall +Uninstall omicron from all machines. + +`cargo run --bin thing-flinger -- -c deploy uninstall` === Current Limitations @@ -140,3 +156,67 @@ effort to use securely. This particular implementation wraps the openssh ssh cli `std::process::Command`, rather than using the `ssh2` crate, because ssh2, as a wrapper around `libssh`, does not support agent-forwarding. +== Notes on Using VMs as Deployed Servers on a Linux Host + +TODO: This section should be fleshed out more and potentially lifted to its own +document; for now this is a collection of rough notes. 
+
+---
+
+It's possible to use a Linux libvirt host running multiple Helios VMs as the
+builder/deployment server targets, but it requires some additional setup beyond
+https://github.com/oxidecomputer/helios-engvm[`helios-engvm`].
+
+`thing-flinger` does not have any support for running the
+`tools/create_virtual_hardware.sh` script; this will need to be done by hand on
+each VM.
+
+---
+
+To enable communication between the VMs over their IPv6 bootstrap networks:
+
+1. Enable IPv6 and DHCP on the virtual network libvirt uses for the VMs; e.g.,
+
+```xml
+<!-- the ULA prefix and addresses below are illustrative; substitute your own -->
+<ip family="ipv6" address="fd00:1234:5678::1" prefix="64">
+  <dhcp>
+    <range start="fd00:1234:5678::100" end="fd00:1234:5678::1ff"/>
+  </dhcp>
+</ip>
+```
+
+After booting the VMs with this enabled, they should be able to ping each other
+over their acquired IPv6 addresses, but connecting to each other over the
+`bootstrap6` interface that sled-agent creates will fail.
+
+2. Explicitly add routes in the Linux host for the `bootstrap6` addresses,
+specifying the virtual interface libvirt created that is used by the VMs.
+
+```
+bash% sudo ip -6 route add fdb0:5254:13:7331::1/64 dev virbr1
+bash% sudo ip -6 route add fdb0:5254:f0:acfd::1/64 dev virbr1
+```
+
+3. Once the sled-agents advance sufficiently to set up `sled6` interfaces,
+routes need to be added for them both in the Linux host and in the Helios VMs.
+Assuming two sleds with these interfaces:
+
+```
+# VM 1
+vioif0/sled6 static ok fd00:1122:3344:1::1/64
+# VM 2
+vioif0/sled6 static ok fd00:1122:3344:2::1/64
+```
+
+The Linux host needs to be told to route that subnet to the appropriate virtual
+interface:
+
+```
+bash% ip -6 route add fd00:1122:3344::1/48 dev virbr1
+```
+
+and each Helios VM needs to be told to route that subnet to the host gateway:
+
+```
+vm% pfexec route add -inet6 fd00:1122:3344::/48 $IPV6_HOST_GATEWAY_ADDR
+```
diff --git a/deploy/src/bin/deployment-example.toml b/deploy/src/bin/deployment-example.toml
index 7121911dd3a..95311ccb054 100644
--- a/deploy/src/bin/deployment-example.toml
+++ b/deploy/src/bin/deployment-example.toml
@@ -15,7 +15,9 @@ server = "foo"
 omicron_path = "/remote/path/to/omicron"
 
 [deployment]
-servers = ["foo", "bar"]
+# which server is responsible for running the rack setup service; must
+# refer to one of the entries in the `servers` table
+rss_server = "foo"
 rack_secret_threshold = 2
 # Location where files to install will be placed before running
 # `omicron-package install`
diff --git a/deploy/src/bin/thing-flinger.rs b/deploy/src/bin/thing-flinger.rs
index c83c6f2a394..68f4363bee0 100644
--- a/deploy/src/bin/thing-flinger.rs
+++ b/deploy/src/bin/thing-flinger.rs
@@ -32,7 +32,7 @@ struct Server {
 
 #[derive(Deserialize, Debug)]
 struct Deployment {
-    servers: BTreeSet<String>,
+    rss_server: String,
     rack_secret_threshold: usize,
     staging_dir: PathBuf,
 }
@@ -78,6 +78,10 @@ enum SubCommand {
         servers: Option<Vec<String>>,
     },
 
+    /// Install necessary prerequisites on the "builder" server and all "deploy"
+    /// servers.
+    InstallPrereqs,
+
     /// Sync our local source to the build host
     Sync,
 
@@ -153,6 +157,19 @@ fn do_exec(
     Ok(())
 }
 
+// start an `rsync` command with args common to all our uses
+fn rsync_common() -> Command {
+    let mut cmd = Command::new("rsync");
+    cmd.arg("-az")
+        .arg("-e")
+        .arg("ssh")
+        .arg("--delete")
+        .arg("--progress")
+        .arg("--out-format")
+        .arg("File changed: %o %t %f");
+    cmd
+}
+
 fn do_sync(config: &Config) -> Result<()> {
     let builder =
         config.servers.get(&config.builder.server).ok_or_else(|| {
@@ -161,8 +178,17 @@ fn do_sync(config: &Config) -> Result<()> {
 
     // For rsync to copy from the source appropriately we must guarantee a
     // trailing slash.
-    let src =
-        format!("{}/", config.omicron_path.canonicalize()?.to_string_lossy());
+    let src = format!(
+        "{}/",
+        config
+            .omicron_path
+            .canonicalize()
+            .with_context(|| format!(
+                "could not canonicalize {}",
+                config.omicron_path.display()
+            ))?
+            .to_string_lossy()
+    );
     let dst = format!(
         "{}@{}:{}",
         builder.username,
@@ -171,29 +197,26 @@ fn do_sync(config: &Config) -> Result<()> {
     );
     println!("Synchronizing source files to: {}", dst);
 
+    let mut cmd = rsync_common();
 
-    let mut cmd = Command::new("rsync");
-    cmd.arg("-az")
-        .arg("-e")
-        .arg("ssh")
-        .arg("--delete")
-        .arg("--progress")
-        .arg("--exclude")
+    // exclude build and development environment artifacts
+    cmd.arg("--exclude")
         .arg("target/")
         .arg("--exclude")
-        .arg("out/")
-        .arg("--exclude")
-        .arg("/cockroachdb/")
-        .arg("--exclude")
-        .arg("/clickhouse/")
+        .arg("*.vdev")
         .arg("--exclude")
         .arg("*.swp")
         .arg("--exclude")
         .arg(".git/")
-        .arg("--out-format")
-        .arg("File changed: %o %t %f")
-        .arg(&src)
-        .arg(&dst);
+        .arg("--exclude")
+        .arg("out/");
+
+    // exclude `config-rss.toml`, which needs to be sent to only one target
+    // system. We handle this in `do_overlay` below.
+    cmd.arg("--exclude").arg("**/config-rss.toml");
+
+    // finish with src/dst
+    cmd.arg(&src).arg(&dst);
 
     let status =
         cmd.status().context(format!("Failed to run command: ({:?})", cmd))?;
     if !status.success() {
@@ -203,6 +226,76 @@ fn do_sync(config: &Config) -> Result<()> {
     Ok(())
 }
 
+fn do_install_prereqs(config: &Config) -> Result<()> {
+    // we need to rsync `./tools/*` to each of the deployment targets (the
+    // "builder" already has it via `do_sync()`), and then run `pfexec
+    // tools/install_prerequisites.sh` on each system.
+    let src = format!(
+        // the `./` here is load-bearing; it interacts with `--relative` to tell
+        // rsync to create `tools` but none of its parents
+        "{}/./tools/",
+        config
+            .omicron_path
+            .canonicalize()
+            .with_context(|| format!(
+                "could not canonicalize {}",
+                config.omicron_path.display()
+            ))?
+ .to_string_lossy() + ); + let partial_cmd = || { + let mut cmd = rsync_common(); + cmd.arg("--relative"); + cmd.arg(&src); + cmd + }; + + for server in config.servers.values() { + let dst = format!( + "{}@{}:{}", + server.username, + server.addr, + config.deployment.staging_dir.to_str().unwrap() + ); + let mut cmd = partial_cmd(); + cmd.arg(&dst); + let status = cmd + .status() + .context(format!("Failed to run command: ({:?})", cmd))?; + if !status.success() { + return Err(FlingError::FailedSync { src, dst }.into()); + } + } + + // run install_prereqs on each server + let builder = &config.servers[&config.builder.server]; + let build_server = (builder, &config.builder.omicron_path); + let all_servers = std::iter::once(build_server).chain( + config.servers.iter().filter_map(|(name, server)| { + // skip running prereq installing on a deployment target if it is + // also the builder, because we're already running it on the builder + if *name == config.builder.server { + None + } else { + Some((server, &config.deployment.staging_dir)) + } + }), + ); + + for (server, root_path) in all_servers { + // -y: assume yes instead of prompting + // -p: skip check that deps end up in $PATH + let cmd = format!( + "cd {} && mkdir -p out && pfexec ./tools/install_prerequisites.sh -y -p", + root_path.display() + ); + println!("install prerequisites on {}", server.addr); + ssh_exec(server, &cmd, false)?; + } + + Ok(()) +} + // Build omicron-package and omicron-deploy on the builder // // We need to build omicron-deploy for overlay file generation @@ -262,10 +355,8 @@ fn do_uninstall( ) -> Result<()> { let mut deployment_src = PathBuf::from(&config.deployment.staging_dir); deployment_src.push(&artifact_dir); - for server_name in &config.deployment.servers { - let builder = &config.servers[&config.builder.server]; - let server = &config.servers[server_name]; - + let builder = &config.servers[&config.builder.server]; + for server in config.servers.values() { copy_omicron_package_binary_to_staging(config, builder, server)?; // Run `omicron-package uninstall` on the deployment server @@ -293,7 +384,7 @@ fn do_install(config: &Config, artifact_dir: &Path, install_dir: &Path) { Vec::<(String, ScopedJoinHandle<'_, Result<()>>)>::new(); // Spawn a thread for each server install - for server_name in &config.deployment.servers { + for server_name in config.servers.keys() { handles.push(( server_name.to_owned(), s.spawn(move |_| -> Result<()> { @@ -334,30 +425,61 @@ fn do_install(config: &Config, artifact_dir: &Path, install_dir: &Path) { } fn do_overlay(config: &Config) -> Result<()> { + let builder = &config.servers[&config.builder.server]; let mut root_path = PathBuf::from(&config.builder.omicron_path); // TODO: This needs to match the artifact_dir in `package` root_path.push("out/overlay"); - let server_dirs = dir_per_deploy_server(config, &root_path); - let builder = &config.servers[&config.builder.server]; - overlay_sled_agent(&builder, config, &server_dirs) + + // Build a list of directories for each server to be deployed and tag which + // one is the server to run RSS; e.g., for servers ["foo", "bar", "baz"] + // with root_path "/my/path", we produce + // [ + // "/my/path/foo/sled-agent/pkg", + // "/my/path/bar/sled-agent/pkg", + // "/my/path/baz/sled-agent/pkg", + // ] + // As we're doing so, record which directory is the one for the server that + // will run RSS. 
+    let mut rss_server_dir = None;
+    let sled_agent_dirs = config
+        .servers
+        .keys()
+        .map(|server_name| {
+            let mut dir = root_path.clone();
+            dir.push(server_name);
+            dir.push("sled-agent/pkg");
+            if *server_name == config.deployment.rss_server {
+                rss_server_dir = Some(dir.clone());
+            }
+            dir
+        })
+        .collect::<Vec<_>>();
+
+    // we know exactly one of the servers matches `rss_server` from our config
+    // validation, so we can unwrap here
+    let rss_server_dir = rss_server_dir.unwrap();
+
+    overlay_sled_agent(builder, config, &sled_agent_dirs)?;
+    overlay_rss_config(builder, config, &rss_server_dir)?;
+
+    Ok(())
 }
 
 fn overlay_sled_agent(
-    server: &Server,
+    builder: &Server,
     config: &Config,
-    server_dirs: &[PathBuf],
+    sled_agent_dirs: &[PathBuf],
 ) -> Result<()> {
-    let sled_agent_dirs: Vec<PathBuf> = server_dirs
+    // Send SSH command to create directories on builder and generate secret
+    // shares.
+
+    // TODO do we need any escaping here? this will definitely break if any dir
+    // names have spaces
+    let dirs = sled_agent_dirs
         .iter()
-        .map(|dir| {
-            let mut dir = PathBuf::from(dir);
-            dir.push("sled-agent/pkg");
-            dir
-        })
-        .collect();
+        .map(|dir| format!("{} ", dir.display()))
+        .collect::<String>();
 
-    // Create directories on builder
-    let dirs = dir_string(&sled_agent_dirs);
     let cmd = format!(
         "sh -c 'for dir in {}; do mkdir -p $dir; done' && \
         cd {} && \
@@ -368,7 +490,38 @@ fn overlay_sled_agent(
         config.deployment.rack_secret_threshold, dirs
     );
-    ssh_exec(server, &cmd, false)
+    ssh_exec(builder, &cmd, false)
+}
+
+fn overlay_rss_config(
+    builder: &Server,
+    config: &Config,
+    rss_server_dir: &Path,
+) -> Result<()> {
+    // Sync `config-rss.toml` to the directory for the RSS server on the
+    // builder.
+    let src = config.omicron_path.join("smf/sled-agent/config-rss.toml");
+    let dst = format!(
+        "{}@{}:{}",
+        builder.username,
+        builder.addr,
+        rss_server_dir.display()
+    );
+
+    let mut cmd = rsync_common();
+    cmd.arg(&src).arg(&dst);
+
+    let status =
+        cmd.status().context(format!("Failed to run command: ({:?})", cmd))?;
+    if !status.success() {
+        return Err(FlingError::FailedSync {
+            src: src.to_string_lossy().to_string(),
+            dst,
+        }
+        .into());
+    }
+
+    Ok(())
 }
 
 fn single_server_install(
@@ -381,16 +534,25 @@ fn single_server_install(
 ) -> Result<()> {
     let server = &config.servers[server_name];
 
-    println!("COPYING packages from builder -> deploy server");
+    println!(
+        "COPYING packages from builder ({}) -> deploy server ({})",
+        builder.addr, server_name
+    );
     copy_package_artifacts_to_staging(config, pkg_dir, builder, server)?;
 
-    println!("COPYING deploy tool from builder -> deploy server");
+    println!(
+        "COPYING deploy tool from builder ({}) -> deploy server ({})",
+        builder.addr, server_name
+    );
     copy_omicron_package_binary_to_staging(config, builder, server)?;
 
-    println!("COPYING manifest from builder -> deploy server");
+    println!(
+        "COPYING manifest from builder ({}) -> deploy server ({})",
+        builder.addr, server_name
+    );
     copy_package_manifest_to_staging(config, builder, server)?;
 
-    println!("INSTALLING packages on deploy server");
+    println!("INSTALLING packages on deploy server ({})", server_name);
     run_omicron_package_install_from_staging(
         config,
         server,
@@ -398,7 +560,10 @@ fn single_server_install(
         &install_dir,
     )?;
 
-    println!("COPYING overlay files from builder -> deploy server");
+    println!(
+        "COPYING overlay files from builder ({}) -> deploy server ({})",
+        builder.addr, server_name
+    );
     copy_overlay_files_to_staging(
         config,
         pkg_dir,
         builder,
server_name, )?; - println!("INSTALLING overlay files into the install directory of the deploy server"); + println!("INSTALLING overlay files into the install directory of the deploy server ({})", server_name); install_overlay_files_from_staging(config, server, &install_dir)?; - println!("RESTARTING services on the deploy server"); + println!("RESTARTING services on the deploy server ({})", server_name); restart_services(server) } @@ -427,7 +592,11 @@ fn copy_package_artifacts_to_staging( ) -> Result<()> { let cmd = format!( "rsync -avz -e 'ssh -o StrictHostKeyChecking=no' \ - --exclude overlay/ {} {}@{}:{}", + --include 'out/' \ + --include 'out/*.tar' \ + --include 'out/*.tar.gz' \ + --exclude '*' \ + {} {}@{}:{}", pkg_dir, destination.username, destination.addr, @@ -535,29 +704,6 @@ fn restart_services(destination: &Server) -> Result<()> { ssh_exec(destination, "svcadm restart sled-agent", false) } -fn dir_string(dirs: &[PathBuf]) -> String { - dirs.iter().map(|dir| dir.to_string_lossy().to_string() + " ").collect() -} - -// For each server to be deployed, append the server name to `root`. -// -// Example (for servers "foo", "bar", "baz"): -// -// dir_per_deploy_server(&config, "/my/path") -> -// vec!["/my/path/foo", "/my/path/bar", "/my/path/baz"] -fn dir_per_deploy_server(config: &Config, root: &Path) -> Vec { - config - .deployment - .servers - .iter() - .map(|server_dir| { - let mut dir = PathBuf::from(root); - dir.push(server_dir); - dir - }) - .collect() -} - fn ssh_exec( server: &Server, remote_cmd: &str, @@ -623,10 +769,11 @@ fn validate(config: &Config) -> Result<(), FlingError> { "deployment.staging_dir", )?; - validate_servers(&config.deployment.servers, &config.servers)?; - validate_servers( - &BTreeSet::from([config.builder.server.clone()]), + &BTreeSet::from([ + config.builder.server.clone(), + config.deployment.rss_server.clone(), + ]), &config.servers, ) } @@ -642,6 +789,7 @@ fn main() -> Result<()> { do_exec(&config, cmd, servers)?; } SubCommand::Sync => do_sync(&config)?, + SubCommand::InstallPrereqs => do_install_prereqs(&config)?, SubCommand::Builder(BuildCommand::Package { artifact_dir }) => { do_package(&config, artifact_dir)?; } diff --git a/package/src/bin/omicron-package.rs b/package/src/bin/omicron-package.rs index 5858a22287c..d3daf67d67d 100644 --- a/package/src/bin/omicron-package.rs +++ b/package/src/bin/omicron-package.rs @@ -124,12 +124,19 @@ async fn do_build(config: &Config) -> Result<()> { // Calculates the SHA256 digest for a file. 
 async fn get_sha256_digest(path: &PathBuf) -> Result<Digest> {
-    let mut reader = BufReader::new(tokio::fs::File::open(&path).await?);
+    let mut reader = BufReader::new(
+        tokio::fs::File::open(&path)
+            .await
+            .with_context(|| format!("could not open {path:?}"))?,
+    );
     let mut context = DigestContext::new(&SHA256);
     let mut buffer = [0; 1024];
 
     loop {
-        let count = reader.read(&mut buffer).await?;
+        let count = reader
+            .read(&mut buffer)
+            .await
+            .with_context(|| format!("failed to read {path:?}"))?;
         if count == 0 {
             break;
         } else {
@@ -170,21 +177,31 @@ async fn get_external_package(
         commit,
         path.as_path().file_name().unwrap().to_string_lossy(),
     );
-    let response = reqwest::Client::new().get(url).send().await?;
+    let response = reqwest::Client::new()
+        .get(&url)
+        .send()
+        .await
+        .with_context(|| format!("failed to get {url}"))?;
     progress.set_length(
         response
             .content_length()
             .ok_or_else(|| anyhow!("Missing Content Length"))?,
     );
-    let mut file = tokio::fs::File::create(path).await?;
+    let mut file = tokio::fs::File::create(&path)
+        .await
+        .with_context(|| format!("failed to create {path:?}"))?;
     let mut stream = response.bytes_stream();
     let mut context = DigestContext::new(&SHA256);
     while let Some(chunk) = stream.next().await {
-        let chunk = chunk?;
+        let chunk = chunk.with_context(|| {
+            format!("failed reading response from {url}")
+        })?;
         // Update the running SHA digest
         context.update(&chunk);
         // Update the downloaded file
-        file.write_all(&chunk).await?;
+        file.write_all(&chunk)
+            .await
+            .with_context(|| format!("failed writing {path:?}"))?;
         // Record progress in the UI
         progress.increment(chunk.len().try_into().unwrap());
     }
@@ -255,7 +272,10 @@ async fn do_package(config: &Config, output_directory: &Path) -> Result<()> {
             progress.set_message("bundle package".to_string());
             package
                 .create_with_progress(&progress, &output_directory)
-                .await?;
+                .await
+                .with_context(|| {
+                    format!("failed to create {package_name} in {output_directory:?}")
+                })?;
             progress.finish();
             Ok(())
         },
diff --git a/package/src/lib.rs b/package/src/lib.rs
index 8bc557b042f..72be8c4d4bf 100644
--- a/package/src/lib.rs
+++ b/package/src/lib.rs
@@ -12,17 +12,21 @@ use thiserror::Error;
 /// Errors which may be returned when parsing the server configuration.
 #[derive(Error, Debug)]
 pub enum ParseError {
-    #[error("Cannot parse toml: {0}")]
-    Toml(#[from] toml::de::Error),
-    #[error("IO error: {0}")]
-    Io(#[from] std::io::Error),
+    #[error("Error deserializing toml from {path:?}: {err}")]
+    Toml { path: PathBuf, err: toml::de::Error },
+    #[error("IO error: {message}: {err}")]
+    Io { message: String, err: std::io::Error },
 }
 
 pub fn parse<P: AsRef<Path>, C: DeserializeOwned>(
     path: P,
 ) -> Result<C, ParseError> {
-    let contents = std::fs::read_to_string(path.as_ref())?;
-    let cfg = toml::from_str::<C>(&contents)?;
+    let path = path.as_ref();
+    let contents = std::fs::read_to_string(path).map_err(|err| {
+        ParseError::Io { message: format!("failed reading {path:?}"), err }
+    })?;
+    let cfg = toml::from_str::<C>(&contents)
+        .map_err(|err| ParseError::Toml { path: path.to_path_buf(), err })?;
     Ok(cfg)
 }
 
diff --git a/tools/install_prerequisites.sh b/tools/install_prerequisites.sh
index 7f3f4aed19d..c348c6d971a 100755
--- a/tools/install_prerequisites.sh
+++ b/tools/install_prerequisites.sh
@@ -19,13 +19,30 @@ function on_exit
 
 trap on_exit ERR
 
-# Offers a confirmation prompt.
+# Parse command line options:
+#
+#   -y    Assume "yes" instead of showing confirmation prompts.
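+#   -p    Skip the check that installed prerequisites end up in $PATH.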
+ASSUME_YES="false"
+SKIP_PATH_CHECK="false"
+while getopts yp flag
+do
+  case "${flag}" in
+    y) ASSUME_YES="true" ;;
+    p) SKIP_PATH_CHECK="true" ;;
+  esac
+done
+
+# Offers a confirmation prompt, unless we were passed `-y`.
 #
 # Args:
 #  $1: Text to be displayed
 function confirm
 {
-  read -r -p "$1 (y/n): " response
+  if [[ "${ASSUME_YES}" == "true" ]]; then
+    response=y
+  else
+    read -r -p "$1 (y/n): " response
+  fi
   case $response in
     [yY])
       true
@@ -151,7 +168,12 @@ function show_hint
   esac
 }
 
-# Check all paths before returning an error.
+# Check all paths before returning an error, unless we were told not to.
+if [[ "$SKIP_PATH_CHECK" == "true" ]]; then
+  echo "All prerequisites installed successfully"
+  exit 0
+fi
+
 ANY_PATH_ERROR="false"
 
 for command in "${expected_in_path[@]}"; do
   rc=0