-
Notifications
You must be signed in to change notification settings - Fork 62
Tunnel sled initialization requests through sprockets sessions #1128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ use omicron_sled_agent::bootstrap::{ | |
| }; | ||
| use omicron_sled_agent::rack_setup::config::SetupServiceConfig as RssConfig; | ||
| use omicron_sled_agent::{config::Config as SledConfig, server as sled_server}; | ||
| use sp_sim::config::GimletConfig; | ||
| use std::net::SocketAddr; | ||
| use std::path::PathBuf; | ||
| use structopt::StructOpt; | ||
|
|
@@ -108,6 +109,20 @@ async fn do_run() -> Result<(), CmdError> { | |
| } else { | ||
| None | ||
| }; | ||
| let sp_config_path = { | ||
| let mut sp_config_path = config_path.clone(); | ||
| sp_config_path.pop(); | ||
| sp_config_path.push("config-sp.toml"); | ||
| sp_config_path | ||
| }; | ||
| let sp_config = if sp_config_path.exists() { | ||
| Some( | ||
| GimletConfig::from_file(sp_config_path) | ||
| .map_err(|e| CmdError::Failure(e.to_string()))?, | ||
| ) | ||
| } else { | ||
| None | ||
| }; | ||
|
|
||
| // Derive the bootstrap address from the data link's MAC address. | ||
| let link = config | ||
|
|
@@ -116,16 +131,47 @@ async fn do_run() -> Result<(), CmdError> { | |
| let bootstrap_address = bootstrap_address(link) | ||
| .map_err(|e| CmdError::Failure(e.to_string()))?; | ||
|
|
||
| // Are we going to simulate a local SP? If so: | ||
| // | ||
| // 1. The bootstrap dropshot server listens on localhost | ||
| // 2. A sprockets proxy listens on `bootstrap_address` (and relays | ||
| // incoming connections to the localhost dropshot server) | ||
| // | ||
| // If we're not simulating a local SP, we can't establish sprockets | ||
| // sessions, so we'll have the bootstrap dropshot server listen on | ||
| // `bootstrap_address` (and no sprockets proxy). | ||
| // | ||
| // TODO-security: With this configuration, dropshot itself is | ||
| // running plain HTTP and blindly trusting all connections from | ||
| // localhost. We have a similar sprockets proxy on the client side, | ||
| // where the proxy blindly trusts all connections from localhost | ||
| // (although the client-side proxy only runs while is being made, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo (missing word?): |
||
| // while our dropshot server is always listening). Can we secure | ||
| // these connections sufficiently? Other options include expanding | ||
| // dropshot/progenitor to allow a custom connection layer (supported | ||
| // by hyper, but not reqwest), keeping the sprockets proxy but using | ||
| // something other than TCP that we can lock down, or abandoning | ||
| // dropshot and using a bespoke protocol over a raw | ||
| // sprockets-encrypted TCP connection. | ||
| let (bootstrap_dropshot_addr, sprockets_proxy_bind_addr) = | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: These two values seem really tightly coupled; I wonder if they should be a distinct type. That being said, I understand passing things through the |
||
| if sp_config.is_some() { | ||
| ("[::1]:0".parse().unwrap(), Some(bootstrap_address)) | ||
| } else { | ||
| (SocketAddr::V6(bootstrap_address), None) | ||
| }; | ||
|
|
||
| // Configure and run the Bootstrap server. | ||
| let bootstrap_config = BootstrapConfig { | ||
| id: config.id, | ||
| dropshot: ConfigDropshot { | ||
| bind_address: SocketAddr::V6(bootstrap_address), | ||
| bind_address: bootstrap_dropshot_addr, | ||
| request_body_max_bytes: 1024 * 1024, | ||
| ..Default::default() | ||
| }, | ||
| log: config.log.clone(), | ||
| rss_config, | ||
| sprockets_proxy_bind_addr, | ||
| sp_config, | ||
| }; | ||
|
|
||
| // TODO: It's a little silly to pass the config this way - namely, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,13 +9,16 @@ use super::discovery::PeerMonitorObserver; | |
| use super::params::SledAgentRequest; | ||
| use crate::rack_setup::config::SetupServiceConfig; | ||
| use crate::rack_setup::service::Service; | ||
| use crate::sp::SpHandle; | ||
| use futures::stream::FuturesUnordered; | ||
| use futures::StreamExt; | ||
| use omicron_common::backoff::internal_service_policy; | ||
| use omicron_common::backoff::retry_notify; | ||
| use omicron_common::backoff::BackoffError; | ||
| use slog::Logger; | ||
| use std::net::SocketAddr; | ||
| use std::net::SocketAddrV6; | ||
| use std::time::Duration; | ||
| use thiserror::Error; | ||
| use tokio::sync::mpsc; | ||
| use tokio::sync::oneshot; | ||
|
|
@@ -43,6 +46,7 @@ impl RssHandle { | |
| log: &Logger, | ||
| config: SetupServiceConfig, | ||
| peer_monitor: PeerMonitorObserver, | ||
| sp: Option<SpHandle>, | ||
| ) -> Self { | ||
| let (tx, rx) = rss_channel(); | ||
|
|
||
|
|
@@ -54,7 +58,7 @@ impl RssHandle { | |
| ); | ||
| let log = log.new(o!("component" => "BootstrapAgentRssHandler")); | ||
| let task = tokio::spawn(async move { | ||
| rx.initialize_sleds(&log).await; | ||
| rx.initialize_sleds(&log, &sp).await; | ||
| }); | ||
| Self { _rss: rss, task } | ||
| } | ||
|
|
@@ -65,6 +69,9 @@ enum InitializeSledAgentError { | |
| #[error("Failed to construct an HTTP client: {0}")] | ||
| HttpClient(#[from] reqwest::Error), | ||
|
|
||
| #[error("Failed to start sprockets proxy: {0}")] | ||
| SprocketsProxy(#[from] sprockets_proxy::Error), | ||
|
|
||
| #[error("Error making HTTP request to Bootstrap Agent: {0}")] | ||
| BootstrapApi( | ||
| #[from] | ||
|
|
@@ -76,6 +83,7 @@ async fn initialize_sled_agent( | |
| log: &Logger, | ||
| bootstrap_addr: SocketAddrV6, | ||
| request: &SledAgentRequest, | ||
| sp: &Option<SpHandle>, | ||
| ) -> Result<(), InitializeSledAgentError> { | ||
| let dur = std::time::Duration::from_secs(60); | ||
|
|
||
|
|
@@ -84,8 +92,57 @@ async fn initialize_sled_agent( | |
| .timeout(dur) | ||
| .build()?; | ||
|
|
||
| let url = format!("http://{}", bootstrap_addr); | ||
| info!(log, "Sending request to peer agent: {}", url); | ||
| let (url, _proxy_task) = if let Some(sp) = sp.as_ref() { | ||
| // We have an SP; spawn a sprockets proxy for this connection. | ||
| let proxy_config = sprockets_proxy::Config { | ||
| bind_address: "[::1]:0".parse().unwrap(), | ||
| target_address: SocketAddr::V6(bootstrap_addr), | ||
| role: sprockets_proxy::Role::Client, | ||
| }; | ||
| // TODO-cleanup The `Duration` passed to `Proxy::new()` is the timeout | ||
| // for communicating with the RoT. Currently it can be set to anything | ||
| // at all (our simulated RoT always responds immediately). Should the | ||
| // value move to our config? | ||
| let proxy = sprockets_proxy::Proxy::new( | ||
| &proxy_config, | ||
| sp.manufacturing_public_key(), | ||
| sp.rot_handle(), | ||
| sp.rot_certs(), | ||
| Duration::from_secs(5), | ||
| log.new(o!("BootstrapAgentClientSprocketsProxy" | ||
| => proxy_config.target_address)), | ||
| ) | ||
| .await?; | ||
|
|
||
| let proxy_addr = proxy.local_addr(); | ||
|
|
||
| let proxy_task = tokio::spawn(async move { | ||
| // TODO-robustness `proxy.run()` only fails if `accept()`ing on our | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Panicking seems totally reasonable to me. |
||
| // already-bound listening socket fails, which means something has | ||
| // gone very wrong. Do we have any recourse other than panicking? | ||
| // What does dropshot do if `accept()` fails? | ||
| proxy.run().await.expect("sprockets client proxy failed"); | ||
| }); | ||
|
|
||
| // Wrap `proxy_task` in `AbortOnDrop`, which will abort it (shutting | ||
| // down the proxy) when we return. | ||
| let proxy_task = AbortOnDrop(proxy_task); | ||
|
|
||
| info!( | ||
| log, "Sending request to peer agent via sprockets proxy"; | ||
| "peer" => %bootstrap_addr, | ||
| "sprockets_proxy" => %proxy_addr, | ||
| ); | ||
| (format!("http://{}", proxy_addr), Some(proxy_task)) | ||
| } else { | ||
| // We have no SP; connect directly. | ||
| info!( | ||
| log, "Sending request to peer agent"; | ||
| "peer" => %bootstrap_addr, | ||
| ); | ||
| (format!("http://{}", bootstrap_addr), None) | ||
| }; | ||
|
|
||
| let client = bootstrap_agent_client::Client::new_with_client( | ||
| &url, | ||
| client, | ||
|
|
@@ -119,7 +176,7 @@ async fn initialize_sled_agent( | |
| }; | ||
| retry_notify(internal_service_policy(), sled_agent_initialize, log_failure) | ||
| .await?; | ||
| info!(log, "Peer agent at {} initialized", url); | ||
| info!(log, "Peer agent initialized"; "peer" => %bootstrap_addr); | ||
| Ok(()) | ||
| } | ||
|
|
||
|
|
@@ -178,7 +235,7 @@ struct BootstrapAgentHandleReceiver { | |
| } | ||
|
|
||
| impl BootstrapAgentHandleReceiver { | ||
| async fn initialize_sleds(mut self, log: &Logger) { | ||
| async fn initialize_sleds(mut self, log: &Logger, sp: &Option<SpHandle>) { | ||
| let (requests, tx_response) = match self.inner.recv().await { | ||
| Some(requests) => requests, | ||
| None => { | ||
|
|
@@ -201,7 +258,7 @@ impl BootstrapAgentHandleReceiver { | |
| "target_sled" => %bootstrap_addr, | ||
| ); | ||
|
|
||
| initialize_sled_agent(log, bootstrap_addr, &request) | ||
| initialize_sled_agent(log, bootstrap_addr, &request, sp) | ||
| .await | ||
| .map_err(|err| { | ||
| format!( | ||
|
|
@@ -241,3 +298,11 @@ impl BootstrapAgentHandleReceiver { | |
| tx_response.send(Ok(())).unwrap(); | ||
| } | ||
| } | ||
|
|
||
| struct AbortOnDrop<T>(JoinHandle<T>); | ||
|
|
||
| impl<T> Drop for AbortOnDrop<T> { | ||
| fn drop(&mut self) { | ||
| self.0.abort(); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We probably should lift this to a Github issue as you mentioned in the PR description; I don't think we want to lose track of this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍 I'll open an issue as a part of merging this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Merged and opened #1161