Sled Agent service initialization cleanup (#3712)
Roughly 3 changes:
1. Skip initializing any zones we already know are running (both from
sled-agent's perspective and as reflected via `zoneadm`).
2. Log any errors `services_put` would return on the server side.
3. Bump file-based zpools to 15G.

Lack of space was causing service initialization to fail on
single-machine deployments using the `create_virtual_hardware` script.
(3) should fix this.

Because of the `ENOSPC` errors, we were running into the future-cancellation
issues that #3707 addressed. But determining the underlying cause was
difficult, because `RSS` was timing out (*) on the `services_put` call and so
never received the error. (2) should hopefully make this easier to catch in
the future, e.g.:
```
04:12:31.312Z ERRO SledAgent: failed to init services: Failed to install zone 'oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127' from '/opt/oxide/clickhouse.tar.gz': Failed to execute zoneadm command 'Install' for zone 'oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127': Failed to parse command output: exit code 1
    stdout:
    A ZFS file system has been created for this zone.
    INFO: omicron: installing zone oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127 @ "/pool/ext/24b4dc87-ab46-49fb-a4b4-d361ae214c03/crypt/zone/oxz_clickhouse_32ce0d6f-38fa-41e7-b699-f0af4f4f1127"...
    INFO: omicron: replicating /usr tree...
    INFO: omicron: replicating /lib tree...
    INFO: omicron: replicating /sbin tree...
    INFO: omicron: pruning SMF manifests...
    INFO: omicron: pruning global-only files...
    INFO: omicron: unpacking baseline archive...
    INFO: omicron: unpacking image "/opt/oxide/clickhouse.tar.gz"...
    stderr:
    Error: No space left on device (os error 28)
    sled_id = 8fd66238-6bb3-4f08-89bf-f88fc2320d83

```
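
To make that failure mode concrete, here is a minimal, self-contained sketch of the pattern used in the `http_entrypoints.rs` change below, with toy stand-ins for the handler and `services_ensure` (names and timings are illustrative, not the actual sled-agent code): the work is spawned onto its own task so a timed-out or disconnected client can't cancel it, and any error is logged server-side so it stays visible either way.
```rust
use std::time::Duration;

// Toy stand-in for the real `services_ensure` work.
async fn services_ensure() -> Result<(), String> {
    tokio::time::sleep(Duration::from_secs(2)).await;
    Err("No space left on device (os error 28)".to_string())
}

// Toy stand-in for the HTTP handler: spawn the work so a disconnecting or
// timed-out client can't cancel it, and log the error server-side so it is
// visible even if nobody is left to receive the response.
async fn services_put() -> Result<(), String> {
    let handler = async {
        match services_ensure().await {
            Ok(()) => Ok(()),
            Err(e) => {
                eprintln!("failed to init services: {e}");
                Err(e)
            }
        }
    };
    tokio::spawn(handler).await.expect("task panicked")
}

#[tokio::main]
async fn main() {
    // Simulate a client that gives up after 1s: the request "times out",
    // but the spawned work still runs to completion and logs its error.
    let result =
        tokio::time::timeout(Duration::from_secs(1), services_put()).await;
    println!("client saw: {result:?}");
    // Give the background task time to finish and log.
    tokio::time::sleep(Duration::from_secs(2)).await;
}
```
In the actual commit, the spawned handler logs through the sled-agent logger exposed by the new `SledAgent::logger()` accessor (see the `sled_agent.rs` diff below).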

(*) Speaking of which, do we want to increase that timeout from just
60s? It's not as if any subsequent requests will succeed, since they'll be
blocked on a lock held by the initial request; and in failure cases like
this the subsequent requests will time out as well, leaving a bunch of
tasks in sled-agent all waiting on the same lock to try initializing. It
might be worth switching to the single-processing-task model that rack/sled
initialization uses; a rough sketch of that idea follows.
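
For what it's worth, a rough sketch of that single-processing-task model, using a tokio `mpsc`/`oneshot` pair and a hypothetical `ServiceRequest` type (this is only the shape of the idea, not the sled-agent implementation): requests queue up behind one worker instead of piling onto a shared lock, and each one is handled, and has its failure logged, exactly once.
```rust
use tokio::sync::{mpsc, oneshot};

/// Hypothetical request type standing in for a `services_put` body.
struct ServiceRequest {
    body: String,
    // The worker sends the result back; if the original caller timed out and
    // went away, the send simply fails and the result is dropped.
    response: oneshot::Sender<Result<(), String>>,
}

/// Single worker task that owns service initialization: requests queue up in
/// the channel rather than blocking on a shared lock, and each request is
/// handled (and its failure logged) exactly once.
async fn service_worker(mut rx: mpsc::Receiver<ServiceRequest>) {
    while let Some(req) = rx.recv().await {
        let result = initialize_services(&req.body).await;
        if let Err(e) = &result {
            eprintln!("failed to init services: {e}");
        }
        let _ = req.response.send(result);
    }
}

/// Placeholder for the real initialization work.
async fn initialize_services(_body: &str) -> Result<(), String> {
    Ok(())
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(16);
    tokio::spawn(service_worker(rx));

    // An HTTP handler would enqueue its request and await the reply,
    // optionally wrapped in its own timeout.
    let (resp_tx, resp_rx) = oneshot::channel();
    tx.send(ServiceRequest { body: "zones".into(), response: resp_tx })
        .await
        .expect("worker is running");
    println!("worker replied: {:?}", resp_rx.await);
}
```
A request that times out client-side simply never sees its `oneshot` reply; the work itself is neither cancelled nor retried by a stack of blocked tasks.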

Finally, (1) addresses the case where we try to initialize a zone
that's already running. That's sometimes fine, since we'll eventually
realize the zone already exists. But for a zone with an OPTE
port, we run into errors like:
```
    error_message_internal = Failed to create OPTE port for service nexus: Failure interacting with the OPTE ioctl(2) interface: command CreateXde failed: MacExists { port: "opte12", vni: Vni { inner: 100 }, mac: MacAddr { inner: A8:40:25:FF:EC:09 } }
```
This happens because the
[check](https://github.com/oxidecomputer/omicron/blob/ebd3db27f6bbd08af3194cae84b0efc6b6e7248d/illumos-utils/src/zone.rs#L282C27-L282C27)
for "is the zone running" comes after we've already done all the work needed
to create the zone (e.g. creating an OPTE port). (1) updates
`initialize_services_locked` to cross-reference sled-agent's view with the
system's view of which zones are running, so that we skip the
unnecessary work.
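
A condensed, self-contained sketch of that cross-referencing logic (the real change is in the `sled-agent/src/services.rs` diff below; `RunningZone` and the system lookup here are hypothetical stand-ins): a requested zone is skipped only when sled-agent's in-memory map and the system both report it running; otherwise the stale entry is dropped and the zone is reinitialized.
```rust
use std::collections::BTreeMap;

/// Hypothetical stand-in for sled-agent's `RunningZone`.
struct RunningZone;

/// Hypothetical stand-in for the system's view of a zone (the real code asks
/// `zoneadm` via illumos-utils).
#[derive(PartialEq)]
enum SystemZoneState {
    Running,
    NotRunning,
}

fn system_zone_state(_name: &str) -> SystemZoneState {
    // Placeholder: pretend the system agrees the zone is running.
    SystemZoneState::Running
}

/// Skip a requested zone only when sled-agent's in-memory map *and* the
/// system agree it is already running; otherwise drop the stale entry so the
/// zone gets reinitialized (including its OPTE port) from scratch.
fn should_skip(existing: &mut BTreeMap<String, RunningZone>, name: &str) -> bool {
    if existing.contains_key(name) {
        if system_zone_state(name) == SystemZoneState::Running {
            return true;
        }
        // Sled-agent thought the zone was running, but the system disagrees.
        let _stale = existing.remove(name);
    }
    false
}

fn main() {
    let mut zones = BTreeMap::new();
    zones.insert("oxz_clickhouse".to_string(), RunningZone);
    println!("skip? {}", should_skip(&mut zones, "oxz_clickhouse"));
}
```
In the commit itself, the system-side lookup goes through `Zones::find` and checks for `zone::State::Running`, and a stale zone is explicitly stopped before being reinitialized.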

---------

Co-authored-by: Sean Klein <sean@oxide.computer>
luqmana and smklein committed Jul 19, 2023
1 parent ebd3db2 commit 1eec1a3
Showing 5 changed files with 85 additions and 35 deletions.
14 changes: 12 additions & 2 deletions sled-agent/src/http_entrypoints.rs
@@ -167,8 +167,18 @@ async fn services_put(
// times out) could result in leaving zones partially configured and the
// in-memory state of the service manager invalid. See:
// oxidecomputer/omicron#3098.
match tokio::spawn(async move { sa.services_ensure(body_args).await }).await
{
let handler = async move {
match sa.services_ensure(body_args).await {
Ok(()) => Ok(()),
Err(e) => {
// Log the error here to make things clear even if the client
// has already disconnected.
error!(sa.logger(), "failed to initialize services: {e}");
Err(e)
}
}
};
match tokio::spawn(handler).await {
Ok(result) => result.map_err(|e| Error::from(e))?,

Err(e) => {
96 changes: 65 additions & 31 deletions sled-agent/src/services.rs
@@ -84,6 +84,7 @@ use sled_hardware::underlay::BOOTSTRAP_PREFIX;
use sled_hardware::Baseboard;
use sled_hardware::SledMode;
use slog::Logger;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashSet;
use std::io::Cursor;
@@ -362,7 +363,7 @@ pub struct ServiceManagerInner {
switch_zone_maghemite_links: Vec<PhysicalLink>,
sidecar_revision: SidecarRevision,
// Zones representing running services
zones: Mutex<Vec<RunningZone>>,
zones: Mutex<BTreeMap<String, RunningZone>>,
underlay_vnic_allocator: VnicAllocator<Etherstub>,
underlay_vnic: EtherstubVnic,
bootstrap_vnic_allocator: VnicAllocator<Etherstub>,
@@ -431,7 +432,7 @@ impl ServiceManager {
time_synced: AtomicBool::new(false),
sidecar_revision,
switch_zone_maghemite_links,
zones: Mutex::new(vec![]),
zones: Mutex::new(BTreeMap::new()),
underlay_vnic_allocator: VnicAllocator::new(
"Service",
underlay_etherstub,
@@ -1908,7 +1909,7 @@ impl ServiceManager {
// Populates `existing_zones` according to the requests in `services`.
async fn initialize_services_locked(
&self,
existing_zones: &mut Vec<RunningZone>,
existing_zones: &mut BTreeMap<String, RunningZone>,
requests: &Vec<ZoneRequest>,
) -> Result<(), Error> {
if let Some(name) = requests
@@ -1923,16 +1924,17 @@
});
}

// We initialize all the zones we can, but only return the first error.
// We initialize all the zones we can, but only return one error, if
// any.
let local_existing_zones = Arc::new(Mutex::new(existing_zones));
let first_err = Arc::new(Mutex::new(None));
let last_err = Arc::new(Mutex::new(None));
stream::iter(requests)
// WARNING: Do not use "try_for_each_concurrent" here -- if you do,
// it's possible that the future will cancel other ongoing requests
// to "initialize_zone".
.for_each_concurrent(None, |request| {
let local_existing_zones = local_existing_zones.clone();
let first_err = first_err.clone();
let last_err = last_err.clone();
async move {
match self
.initialize_zone(
@@ -1943,20 +1945,20 @@
.await
{
Ok(running_zone) => {
local_existing_zones
.lock()
.await
.push(running_zone);
local_existing_zones.lock().await.insert(
running_zone.name().to_string(),
running_zone,
);
}
Err(err) => {
*first_err.lock().await = Some(err);
*last_err.lock().await = Some(err);
}
}
}
})
.await;

if let Some(err) = Arc::into_inner(first_err)
if let Some(err) = Arc::into_inner(last_err)
.expect("Should have last reference")
.into_inner()
{
@@ -2245,9 +2247,7 @@ impl ServiceManager {
.map_err(Error::from);
}
}
if let Some(zone) =
self.inner.zones.lock().await.iter().find(|z| z.name() == name)
{
if let Some(zone) = self.inner.zones.lock().await.get(name) {
return self
.create_zone_bundle_impl(zone)
.await
@@ -2424,9 +2424,14 @@
{
zone_names.push(String::from(zone.name()))
}
for zone in self.inner.zones.lock().await.iter() {
zone_names.push(String::from(zone.name()));
}
zone_names.extend(
self.inner
.zones
.lock()
.await
.values()
.map(|zone| zone.name().to_string()),
);
zone_names.sort();
Ok(zone_names)
}
@@ -2480,7 +2485,7 @@ impl ServiceManager {
// re-instantiated on boot.
async fn ensure_all_services(
&self,
existing_zones: &mut MutexGuard<'_, Vec<RunningZone>>,
existing_zones: &mut MutexGuard<'_, BTreeMap<String, RunningZone>>,
old_request: &AllZoneRequests,
request: ServiceEnsureBody,
) -> Result<AllZoneRequests, Error> {
@@ -2502,11 +2507,7 @@
// Destroy zones that should not be running
for zone in zones_to_be_removed {
let expected_zone_name = zone.zone_name();
if let Some(idx) = existing_zones
.iter()
.position(|z| z.name() == expected_zone_name)
{
let mut zone = existing_zones.remove(idx);
if let Some(mut zone) = existing_zones.remove(&expected_zone_name) {
if let Err(e) = zone.stop().await {
error!(log, "Failed to stop zone {}: {e}", zone.name());
}
@@ -2520,6 +2521,36 @@
let all_u2_roots =
self.inner.storage.all_u2_mountpoints(ZONE_DATASET).await;
for zone in zones_to_be_added {
// Check if we think the zone should already be running
let name = zone.zone_name();
if existing_zones.contains_key(&name) {
// Make sure the zone actually exists in the right state too
match Zones::find(&name).await {
Ok(Some(zone)) if zone.state() == zone::State::Running => {
info!(log, "skipping running zone"; "zone" => &name);
continue;
}
_ => {
// Mismatch between SA's view and reality; let's try to
// clean up any remnants and try to initialize it again
warn!(
log,
"expected to find existing zone in running state";
"zone" => &name,
);
if let Err(e) =
existing_zones.remove(&name).unwrap().stop().await
{
error!(
log,
"Failed to stop zone";
"zone" => &name,
"error" => %e,
);
}
}
}
}
// For each new zone request, we pick an arbitrary U.2 to store
// the zone filesystem. Note: This isn't known to Nexus right now,
// so it's a local-to-sled decision.
@@ -2552,7 +2583,7 @@ impl ServiceManager {
pub async fn cockroachdb_initialize(&self) -> Result<(), Error> {
let log = &self.inner.log;
let dataset_zones = self.inner.zones.lock().await;
for zone in dataset_zones.iter() {
for zone in dataset_zones.values() {
// TODO: We could probably store the ZoneKind in the running zone to
// make this "comparison to existing zones by name" mechanism a bit
// safer.
@@ -2608,7 +2639,10 @@ impl ServiceManager {
Ok(())
}

pub fn boottime_rewrite(&self, zones: &Vec<RunningZone>) {
pub fn boottime_rewrite<'a>(
&self,
zones: impl Iterator<Item = &'a RunningZone>,
) {
if self
.inner
.time_synced
@@ -2626,7 +2660,6 @@ impl ServiceManager {
info!(self.inner.log, "Setting boot time to {:?}", now);

let files: Vec<Utf8PathBuf> = zones
.iter()
.map(|z| z.root())
.chain(iter::once(Utf8PathBuf::from("/")))
.flat_map(|r| [r.join("var/adm/utmpx"), r.join("var/adm/wtmpx")])
@@ -2655,7 +2688,7 @@ impl ServiceManager {

if let Some(true) = self.inner.skip_timesync {
info!(self.inner.log, "Configured to skip timesync checks");
self.boottime_rewrite(&existing_zones);
self.boottime_rewrite(existing_zones.values());
return Ok(TimeSync { sync: true, skew: 0.00, correction: 0.00 });
};

Expand All @@ -2664,8 +2697,9 @@ impl ServiceManager {

let ntp_zone = existing_zones
.iter()
.find(|z| z.name().starts_with(&ntp_zone_name))
.ok_or_else(|| Error::NtpZoneNotReady)?;
.find(|(name, _)| name.starts_with(&ntp_zone_name))
.ok_or_else(|| Error::NtpZoneNotReady)?
.1;

// XXXNTP - This could be replaced with a direct connection to the
// daemon using a patched version of the chrony_candm crate to allow
@@ -2687,7 +2721,7 @@ impl ServiceManager {
&& correction.abs() <= 0.05;

if sync {
self.boottime_rewrite(&existing_zones);
self.boottime_rewrite(existing_zones.values());
}

Ok(TimeSync { sync, skew, correction })
6 changes: 6 additions & 0 deletions sled-agent/src/sled_agent.rs
@@ -210,6 +210,7 @@ impl SledAgentInner {
#[derive(Clone)]
pub struct SledAgent {
inner: Arc<SledAgentInner>,
log: Logger,
}

impl SledAgent {
@@ -351,6 +352,7 @@ impl SledAgent {
// Also, we could maybe de-dup some of the backoff code in the request queue?
nexus_request_queue: NexusRequestQueue::new(),
}),
log: log.clone(),
};

// We immediately add a notification to the request queue about our
@@ -475,6 +477,10 @@ impl SledAgent {
self.inner.id
}

pub fn logger(&self) -> &Logger {
&self.log
}

// Sends a request to Nexus informing it that the current sled exists.
fn notify_nexus_about_self(&self, log: &Logger) {
let sled_id = self.inner.id;
2 changes: 1 addition & 1 deletion tools/create_virtual_hardware.sh
@@ -47,7 +47,7 @@ function ensure_zpools {
echo "Zpool: [$ZPOOL]"
VDEV_PATH="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev"
if ! [[ -f "$VDEV_PATH" ]]; then
dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=10G
dd if=/dev/zero of="$VDEV_PATH" bs=1 count=0 seek=15G
fi
success "ZFS vdev $VDEV_PATH exists"
if [[ -z "$(zpool list -o name | grep $ZPOOL)" ]]; then
2 changes: 1 addition & 1 deletion tools/destroy_virtual_hardware.sh
@@ -108,7 +108,7 @@ function try_destroy_zpools {
for ZPOOL in "${ZPOOLS[@]}"; do
VDEV_FILE="${ZPOOL_VDEV_DIR:-$OMICRON_TOP}/$ZPOOL.vdev"
zfs destroy -r "$ZPOOL" && \
zfs unmount "$ZPOOL" && \
(zfs unmount "$ZPOOL" || true) && \
zpool destroy "$ZPOOL" && \
rm -f "$VDEV_FILE" || \
warn "Failed to remove ZFS pool and vdev: $ZPOOL"
