Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ serde = { version = "1.0", features = [ "derive" ] }
serde_json = "1.0"
slog = { version = "2.5", features = [ "max_level_trace", "release_max_level_debug" ] }
smf = "0.2"
socket2 = { version = "0.4", features = [ "all" ] }
structopt = "0.3"
tar = "0.4"
tempfile = "3.2"
Expand Down
156 changes: 118 additions & 38 deletions sled-agent/src/bootstrap/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,25 @@

use super::client::types as bootstrap_types;
use super::client::Client as BootstrapClient;
use super::discovery;
use super::views::ShareResponse;
use omicron_common::api::external::Error;
use omicron_common::api::external::Error as ExternalError;
use omicron_common::backoff::{
internal_service_policy, retry_notify, BackoffError,
};
use omicron_common::packaging::sha256_digest;

use slog::Logger;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Seek, SeekFrom};
use std::net::SocketAddr;
use std::path::Path;
use tar::Archive;
use thiserror::Error;

const UNLOCK_THRESHOLD: usize = 2;
const BOOTSTRAP_PORT: u16 = 12346;

/// Describes errors which may occur while operating the bootstrap service.
#[derive(Error, Debug)]
pub enum BootstrapError {
Expand All @@ -39,24 +45,35 @@ pub enum BootstrapError {

#[error("Error making HTTP request")]
Api(#[from] anyhow::Error),

#[error("Not enough peers to unlock storage")]
NotEnoughPeers,
}

impl From<BootstrapError> for ExternalError {
fn from(err: BootstrapError) -> Self {
Self::internal_error(&err.to_string())
}
}

/// The entity responsible for bootstrapping an Oxide rack.
pub(crate) struct Agent {
/// Debug log
log: Logger,
/// Monitors multicast announcements from other sled agents, recording
/// the addresses of peers observed so far (see `discovery::PeerMonitor`).
peer_monitor: discovery::PeerMonitor,
}

impl Agent {
pub fn new(log: Logger) -> Self {
Agent { log }
/// Creates a new bootstrap [`Agent`].
///
/// Spawns a background `PeerMonitor`, which both announces this sled and
/// listens for peer announcements over multicast. Any error from setting
/// up the monitor's sockets is propagated (via `?`) as a
/// [`BootstrapError`].
pub fn new(log: Logger) -> Result<Self, BootstrapError> {
let peer_monitor = discovery::PeerMonitor::new(&log)?;
Ok(Agent { log, peer_monitor })
}

/// Implements the "request share" API.
pub async fn request_share(
&self,
identity: Vec<u8>,
) -> Result<ShareResponse, Error> {
) -> Result<ShareResponse, BootstrapError> {
// TODO-correctness: Validate identity, return whatever
// information is necessary to establish trust quorum.
//
Expand All @@ -66,42 +83,92 @@ impl Agent {
Ok(ShareResponse { shared_secret: vec![] })
}

/// Performs device initialization:
/// Communicates with peers, sharing secrets, until the rack has been
/// sufficiently unlocked.
///
/// - TODO: Communicates with other sled agents to establish a trust quorum.
/// - Verifies, unpacks, and launches other services.
pub async fn initialize(
&self,
other_agents: Vec<SocketAddr>,
) -> Result<(), BootstrapError> {
info!(&self.log, "bootstrap service initializing");
// TODO-correctness:
// - Establish trust quorum.
// - Once this is done, "unlock" local storage
//
// The current implementation sends a stub request to all known
// sled agents, but does not actually create a quorum / unlock
// anything.
let other_agents: Vec<BootstrapClient> = other_agents
.into_iter()
.map(|addr| {
let addr_str = addr.to_string();
BootstrapClient::new(
&format!("http://{}", addr_str,),
self.log.new(o!(
"Address" => addr_str,
)),
/// - This method retries until [`UNLOCK_THRESHOLD`] other agents are
/// online, and have successfully responded to "share requests".
async fn establish_sled_quorum(&self) -> Result<(), BootstrapError> {
retry_notify(
internal_service_policy(),
|| async {
let other_agents = self.peer_monitor.addrs().await;
info!(&self.log, "Bootstrap: Communicating with peers: {:?}", other_agents);

// "-1" to account for ourselves.
//
// NOTE: Clippy error exists while the compile-time unlock
// threshold is "1", because we basically don't require any
// peers to unlock.
#[allow(clippy::absurd_extreme_comparisons)]
if other_agents.len() < UNLOCK_THRESHOLD - 1 {
warn!(&self.log, "Not enough peers to start establishing quorum");
return Err(BackoffError::Transient(
BootstrapError::NotEnoughPeers,
));
}
info!(&self.log, "Bootstrap: Enough peers to start share transfer");

// TODO-correctness:
// - Establish trust quorum.
// - Once this is done, "unlock" local storage
//
// The current implementation sends a stub request to all known sled
// agents, but does not actually create a quorum / unlock anything.
let other_agents: Vec<BootstrapClient> = other_agents
.into_iter()
.map(|mut addr| {
addr.set_port(BOOTSTRAP_PORT);
// TODO-correctness:
//
// Many rust crates - such as "URL" - really dislike
// using scopes in IPv6 addresses. Using
// "addr.to_string()" results in an IP address format
// that is rejected when embedded into a URL.
//
// Instead, we merely use IP and port for the moment,
// which loses the scope information. Longer-term, if we
// use ULAs (Unique Local Addresses) the scope shouldn't
// be a factor anyway.
let addr_str = format!("[{}]:{}", addr.ip(), addr.port());
info!(&self.log, "bootstrap: Connecting to {}", addr_str);
BootstrapClient::new(
&format!("http://{}", addr_str),
self.log.new(o!(
"Address" => addr_str,
)),
)
})
.collect();
for agent in &other_agents {
agent
.api_request_share(&bootstrap_types::ShareRequest {
identity: vec![],
})
.await
.map_err(|e| {
info!(&self.log, "Bootstrap: Failed to share request with peer: {:?}", e);
BackoffError::Transient(BootstrapError::Api(e))
})?;
info!(&self.log, "Bootstrap: Shared request with peer");
}
Ok(())
},
|error, duration| {
warn!(
self.log,
"Failed to unlock sleds (will retry after {:?}: {:#}",
duration,
error,
)
})
.collect();
for agent in &other_agents {
agent
.api_request_share(&bootstrap_types::ShareRequest {
identity: vec![],
})
.await?;
}
},
)
.await?;

Ok(())
}

async fn launch_local_services(&self) -> Result<(), BootstrapError> {
let tar_source = Path::new("/opt/oxide");
let destination = Path::new("/opt/oxide");
// TODO-correctness: Validation should come from ROT, not local file.
Expand Down Expand Up @@ -129,6 +196,19 @@ impl Agent {
Ok(())
}

/// Performs device initialization:
///
/// - TODO: Communicates with other sled agents to establish a trust quorum.
/// - Verifies, unpacks, and launches other services.
pub async fn initialize(&self) -> Result<(), BootstrapError> {
info!(&self.log, "bootstrap service initializing");

self.establish_sled_quorum().await?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe there needs to be a determination about phases here. If a sled does not already know the group of existing sleds (saved in a file) where each sled id is a public key or key fingerprint, that sled is not capable of unlocking itself, or doing much of anything except listening on its multicast address for a request from the primary. ( Note that we use the public key as sled id, since we want to allow movement of sleds throughout the rack and for their IP addresses to change. )

The above scenario is going to be the case of all sleds except the primary during initial rack setup. The primary will start a PeerMonitor and wait for (in the demo case) a predefined number of sleds to respond. The primary will then connect over TCP to a predetermined SPDM port at each of the sled-agents that responded, using the responded IP address. The primary will then become an SPDM requester and attempt to create a secure channel (and pretend to attest measurements), over that TCP connection. When all the sleds are connected via secure SPDM channels, a secret will be generated and distributed to each sled-agent along with its individual key-share and group information. This is Phase 1 of the protocol and only happens during rack initialization. We aren't yet considering what it means to add a sled to the group at runtime.

Phase 2 is what I believe establish_sled_quorum was meant to encapsulate. This is where each sled already has a key share and knows the group members. In this case, when a sled restarts it will run a PeerMonitor to get the IPs of a threshold of sleds and then create a secure SPDM channel to each of those sled-agents. Both sides of the SPDM channel should ensure that any received certs or digests actually match the group information, although its unclear if we need to do this for the demo. This is because while SPDM supports mutual authentication, it's not yet implemented, and so we are going to pretend to setup a secure channel by running the protocol up to the implemented challenge authentication phase. Once our pseudo-SPDM secured channel is established the remote sled can send the requested key share. When a quorum is retrieved the rack secret can be reconstructed and the sled unlocked.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, this code - as implemented - does not have a phase explicitly for sharing keys.

I don't have a strong opinion about whether or not this is implemented in establish_sled_quorum or not - but as long as it happens after the PeerMonitor is up and running, I'm happy.

self.launch_local_services().await?;

Ok(())
}

fn launch<S, P1, P2>(
&self,
digests: &HashMap<String, Vec<u8>>,
Expand Down
93 changes: 93 additions & 0 deletions sled-agent/src/bootstrap/discovery.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Sled announcement and discovery.

use super::multicast;
use slog::Logger;
use std::collections::HashSet;
use std::io;
use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6};
use std::sync::Arc;
use tokio::net::UdpSocket;
use tokio::sync::Mutex;
use tokio::task::JoinHandle;

/// Manages Sled Discovery - both our announcement to other Sleds,
/// as well as our discovery of those sleds.
pub struct PeerMonitor {
/// Addresses of peers observed on the multicast channel so far.
/// Shared with the background worker task, which inserts new entries
/// as announcements arrive; readers snapshot it via `addrs()`.
sleds: Arc<Mutex<HashSet<SocketAddr>>>,
/// Handle to the spawned announce/listen worker loop. Held (but never
/// awaited) so the task is associated with this monitor.
_worker: JoinHandle<()>,
}

async fn monitor_worker(
log: Logger,
address: SocketAddrV6,
sender: UdpSocket,
listener: UdpSocket,
sleds: Arc<Mutex<HashSet<SocketAddr>>>,
) {
// Let this message be a reminder that this content is *not*
// encrypted, authenticated, or otherwise verified. We're just using
// it as a starting point for swapping addresses.
let message =
b"We've been trying to reach you about your car's extended warranty";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol

loop {
let mut buf = vec![0u8; 128];
tokio::select! {
_ = tokio::time::sleep(tokio::time::Duration::from_millis(5000)) => {
info!(log, "Bootstrap Peer Monitor: Broadcasting our own address: {}", address);
if let Err(e) = sender.try_send_to(message, address.into()) {
warn!(log, "PeerMonitor failed to broadcast: {}", e);
}
}
result = listener.recv_from(&mut buf) => {
match result {
Ok((_, addr)) => {
info!(log, "Bootstrap Peer Monitor: Successfully received an address: {}", addr);
sleds.lock().await.insert(addr);
Comment on lines +47 to +49
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would ideally be the address we use for subsequent communication with the sled.

Admittedly the sled agent / bootstrap servers are using an SMF file configured to use IPv4 addresses, but this is an IPv6 address. That presents a bit of a challenge - presumably we'd want everyone to be communicating over IPv6 in the long-term, no?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently there is no plan to have any IPv4 on the underlay.

I think it's still an open question how stable addresses will find their way onto servers. The approach I've been taking in my testing setups is the following.

  • Each router running on a rack switch is started with an IPv6 /56 prefix, who provides that? ... not sure.
  • When routers running on servers peer with a rack-level router, the rack-level router delegates a /64 to them and then the router running on the server automatically assigns the first address in that /64 to the server.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will make the change to use IPv6 for the bootstrap server explicitly within this change.

I hear you on the "no IPv4 in the underlay" - we should trend in that direction - but that transition can be more gradual.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filing #442 to track

},
Err(e) => warn!(log, "PeerMonitor failed to receive: {}", e),
}
}
}
}
}

impl PeerMonitor {
/// Creates a new [`PeerMonitor`].
// TODO: Address, port, interface, etc, probably should be
// configuration options.
pub fn new(log: &Logger) -> Result<Self, io::Error> {
let scope = multicast::Ipv6MulticastScope::LinkLocal.first_hextet();
let address = SocketAddrV6::new(
Ipv6Addr::new(scope, 0, 0, 0, 0, 0, 0, 0x1),
7645,
0,
0,
);
Comment on lines +64 to +69
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other than the scope - which is intentional - the rest of this address is 100% arbitrary.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should use either Admin-Local or Site-Local scope here. It's not guaranteed that there will be an L2 domain across the rack. RFD 63 lays out two possible paths for bootstrapping the rack, one that has an L2 broadcast domain for starting up, and another that starts in L3. I'm personally leaning toward the latter so we do not have to change the shape of the network as a part of starting up. Admin-Local or Site-Local scopes should work for either alternative, but Link-Local will only work for the former.

I'd suggest coming up with a set of constant/well-known multicast group addresses that correspond to particular communication domains. For example for Rift peering in Maghemite we use ff02::a1f7

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment definitely led me to some required reading. Thank you for the feedback @rcgoodfellow. I still have a few questions on this though.

  1. I keep reading that site-local IPv6 addresses are deprecated. What impact would that have on choosing site-local as a multicast scope?
  2. If we use a site-local multicast scope, does that mean that the "stable" addresses assigned to the link must be in the site-local format?
  3. It looks like admin-local multicast scope doesn't have a corresponding format for addresses. Are these just global unicast addresses then?
  4. Will the router on the switches prevent admin-local or site-local multicast from leaving a single rack? Is a "site" or smallest administrative domain used for admin-local just something that we are allowed to determine? In other words, can Oxide just go ahead and say any site-local or admin-local traffic must be contained within a single rack?
  5. If we went with an L2 domain across the rack, and allowed use of link-local addresses, we could ensure that bootstrap traffic never left the rack, and also ensure that the sled-agent bootstrap server was inaccessible from outside the rack automatically. With global unicast addresses (assuming that's what we use for site-local/admin-local), how do we ensure this? Firewall rules?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. My understanding is that site-local ULAs were deprecated, but site-local multicast addresses are still OK. However, it seems to me that some of the discussion in RFC 3879 also applies to site-local multicast, in particular section 2.5. We do completely own our networks, so we can take it upon ourselves to come up with a definition of "site" that is useful to us (and understood by our routers).
  2. I do not believe so, but this is a good question to get a concrete answer for.
  3. By corresponding, do you mean the source address that will be used to send messages to the multicast group? If so I think the answer is tied to (2) e.g. I do not believe multicast scope constrains source address scope, but I'll find out for sure.
  4. I believe this is the very purpose of the admin-local scope, to let admins decide what is "local". And I believe in either case the answer is yes.
  5. For global unicast addresses sending to some sort of scoped multicast address (or any multicast address for that matter) in order for that traffic to leave the rack, some router will need to route it out of the rack. We can constrain the propagation of multicast to only live within a single rack in a number of ways (only allowing multicast to route to servers, limiting TTLs, etc.) For unicast traffic, we could similarly have an address space that is only routed within a rack. Just to be clear, we are talking about the server NICs and not the SP NICs right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks Ry

By corresponding, do you mean the source address that will be used to send messages to the multicast group? If so I think the answer is tied to (2) e.g. I do not believe multicast scope constrains source address scope, but I'll find out for sure.

Yes, exactly. There is a site-specific unicast address and a multicast scope, but only a multicast scope for admin, with no related unicast address.

Just to be clear, we are talking about the server NICs and not the SP NICs right?
Yes.

let loopback = false;
let interface = 0;
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be happy to pick a more specific interface, if there was a good way to do so. Feedback welcome.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a tricky question. I have not given much thought to multicast routing yet. At a basic level there are two potentially viable interfaces for this software to chose from, and I think the right answer might be both, so not specifying the interface for now seems reasonable. This means that traffic will egress on both interfaces and also ingress on both interfaces for the receiving servers.

let (sender, listener) =
multicast::new_ipv6_udp_pair(&address, loopback, interface)?;

let sleds = Arc::new(Mutex::new(HashSet::new()));
let sleds_for_worker = sleds.clone();
let log = log.clone();

let worker = tokio::task::spawn(async move {
monitor_worker(log, address, sender, listener, sleds_for_worker)
.await
});

Ok(PeerMonitor { sleds, _worker: worker })
}

/// Returns the addresses of connected sleds.
///
/// Note: These sleds have not yet been verified. Ordering of the
/// returned addresses is unspecified (backed by a `HashSet`).
pub async fn addrs(&self) -> Vec<SocketAddr> {
    // `.copied()` replaces the manual `.map(|addr| *addr)` — the
    // idiomatic (clippy-preferred) way to dereference an iterator of
    // `&T` when `T: Copy`.
    self.sleds.lock().await.iter().copied().collect()
}
}
8 changes: 7 additions & 1 deletion sled-agent/src/bootstrap/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use dropshot::HttpError;
use dropshot::HttpResponseOk;
use dropshot::RequestContext;
use dropshot::TypedBody;
use omicron_common::api::external::Error as ExternalError;
use std::sync::Arc;

use super::agent::Agent;
Expand Down Expand Up @@ -62,5 +63,10 @@ async fn api_request_share(
let bootstrap_agent = rqctx.context();

let request = request.into_inner();
Ok(HttpResponseOk(bootstrap_agent.request_share(request.identity).await?))
Ok(HttpResponseOk(
bootstrap_agent
.request_share(request.identity)
.await
.map_err(|e| ExternalError::from(e))?,
))
}
2 changes: 2 additions & 0 deletions sled-agent/src/bootstrap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
pub mod agent;
mod client;
pub mod config;
mod discovery;
mod http_entrypoints;
mod multicast;
mod params;
pub mod server;
mod views;
Loading