diff --git a/Cargo.lock b/Cargo.lock index 7abfee7634b..91ea6780212 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3464,6 +3464,7 @@ dependencies = [ "clap 3.2.6", "dropshot", "expectorate", + "internal-dns-client", "nexus-client 0.1.0", "omicron-common 0.1.0", "omicron-test-utils", diff --git a/common/src/address.rs b/common/src/address.rs index 3dee3848b9e..708fbff12bd 100644 --- a/common/src/address.rs +++ b/common/src/address.rs @@ -33,9 +33,11 @@ pub const SLED_AGENT_PORT: u16 = 12345; /// The port propolis-server listens on inside the propolis zone. pub const PROPOLIS_PORT: u16 = 12400; - +pub const CLICKHOUSE_PORT: u16 = 8123; pub const OXIMETER_PORT: u16 = 12223; +pub const NEXUS_INTERNAL_PORT: u16 = 12221; + // Anycast is a mechanism in which a single IP address is shared by multiple // devices, and the destination is located based on routing distance. // diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 50ceeb191a3..e96ea3ae835 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -18,7 +18,7 @@ use oximeter_collector::Oximeter; use oximeter_producer::Server as ProducerServer; use slog::o; use slog::Logger; -use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::Path; use std::time::Duration; use uuid::Uuid; @@ -199,21 +199,20 @@ pub async fn start_oximeter( id: Uuid, ) -> Result { let db = oximeter_collector::DbConfig { - address: SocketAddr::new(Ipv6Addr::LOCALHOST.into(), db_port), + address: Some(SocketAddr::new(Ipv6Addr::LOCALHOST.into(), db_port)), batch_size: 10, batch_interval: 1, }; let config = oximeter_collector::Config { - id, - nexus_address, + nexus_address: Some(nexus_address), db, - dropshot: ConfigDropshot { - bind_address: SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0), - ..Default::default() - }, log: ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Error }, }; - Oximeter::with_logger(&config, log).await.map_err(|e| e.to_string()) + let args = oximeter_collector::OximeterArguments { + id, + address: SocketAddrV6::new(Ipv6Addr::LOCALHOST, 0, 0, 0), + }; + Oximeter::with_logger(&config, &args, log).await.map_err(|e| e.to_string()) } #[derive(Debug, Clone, oximeter::Target)] diff --git a/oximeter/collector/Cargo.toml b/oximeter/collector/Cargo.toml index 7e36050d9af..10fe6058c0a 100644 --- a/oximeter/collector/Cargo.toml +++ b/oximeter/collector/Cargo.toml @@ -8,6 +8,7 @@ license = "MPL-2.0" [dependencies] clap = { version = "3.2", features = ["derive"] } dropshot = { git = "https://github.com/oxidecomputer/dropshot", branch = "main", features = [ "usdt-probes" ] } +internal-dns-client = { path = "../../internal-dns-client" } nexus-client = { path = "../../nexus-client" } omicron-common = { path = "../../common" } oximeter = { path = "../oximeter" } diff --git a/oximeter/collector/config.toml b/oximeter/collector/config.toml index 6b03a3974d2..0e8557a71bf 100644 --- a/oximeter/collector/config.toml +++ b/oximeter/collector/config.toml @@ -1,16 +1,9 @@ # Example configuration file for running an oximeter collector server -id = "1da65e5b-210c-4859-a7d7-200c1e659972" -nexus_address = "127.0.0.1:12221" - [db] -address = "[::1]:8123" batch_size = 1000 batch_interval = 5 # In seconds [log] level = "debug" mode = "stderr-terminal" - -[dropshot] -bind_address = "[::1]:12223" diff --git a/oximeter/collector/src/bin/oximeter.rs b/oximeter/collector/src/bin/oximeter.rs index 19f9b5b3da0..bf54cf33fa0 100644 --- a/oximeter/collector/src/bin/oximeter.rs +++ b/oximeter/collector/src/bin/oximeter.rs @@ -8,8 +8,10 @@ use clap::Parser; use omicron_common::cmd::fatal; use omicron_common::cmd::CmdError; -use oximeter_collector::{oximeter_api, Config, Oximeter}; +use oximeter_collector::{oximeter_api, Config, Oximeter, OximeterArguments}; +use std::net::SocketAddrV6; use std::path::PathBuf; +use uuid::Uuid; pub fn run_openapi() -> Result<(), String> { oximeter_api() @@ -24,18 +26,22 @@ pub fn run_openapi() -> Result<(), String> { /// Run an oximeter metric collection server in the Oxide Control Plane. #[derive(Parser)] #[clap(name = "oximeter", about = "See README.adoc for more information")] -struct Args { - #[clap( - short = 'O', - long = "openapi", - help = "Print the external OpenAPI Spec document and exit", - action - )] - openapi: bool, - - /// Path to TOML file with configuration for the server - #[clap(name = "CONFIG_FILE", action)] - config_file: PathBuf, +enum Args { + /// Print the external OpenAPI Spec document and exit + Openapi, + + /// Start an Oximeter server + Run { + /// Path to TOML file with configuration for the server + #[clap(name = "CONFIG_FILE", action)] + config_file: PathBuf, + + #[clap(short, long, action)] + id: Uuid, + + #[clap(short, long, action)] + address: SocketAddrV6, + }, } #[tokio::main] @@ -47,15 +53,17 @@ async fn main() { async fn do_run() -> Result<(), CmdError> { let args = Args::parse(); - let config = Config::from_file(args.config_file).unwrap(); - if args.openapi { - run_openapi().map_err(CmdError::Failure) - } else { - Oximeter::new(&config) - .await - .unwrap() - .serve_forever() - .await - .map_err(|e| CmdError::Failure(e.to_string())) + match args { + Args::Openapi => run_openapi().map_err(CmdError::Failure), + Args::Run { config_file, id, address } => { + let config = Config::from_file(config_file).unwrap(); + let args = OximeterArguments { id, address }; + Oximeter::new(&config, &args) + .await + .unwrap() + .serve_forever() + .await + .map_err(|e| CmdError::Failure(e.to_string())) + } } } diff --git a/oximeter/collector/src/lib.rs b/oximeter/collector/src/lib.rs index 8be720f3e51..85151aaa5db 100644 --- a/oximeter/collector/src/lib.rs +++ b/oximeter/collector/src/lib.rs @@ -11,6 +11,11 @@ use dropshot::{ HttpResponseUpdatedNoContent, HttpServer, HttpServerStarter, RequestContext, TypedBody, }; +use internal_dns_client::{ + multiclient::{ResolveError, Resolver}, + names::{ServiceName, SRV}, +}; +use omicron_common::address::{CLICKHOUSE_PORT, NEXUS_INTERNAL_PORT}; use omicron_common::api::internal::nexus::ProducerEndpoint; use omicron_common::backoff; use oximeter::types::{ProducerResults, ProducerResultsItem}; @@ -18,7 +23,7 @@ use oximeter_db::{Client, DbWrite}; use serde::{Deserialize, Serialize}; use slog::{debug, error, info, o, trace, warn, Drain, Logger}; use std::collections::{btree_map::Entry, BTreeMap}; -use std::net::SocketAddr; +use std::net::{SocketAddr, SocketAddrV6}; use std::path::Path; use std::sync::Arc; use std::time::Duration; @@ -37,6 +42,9 @@ pub enum Error { #[error(transparent)] Database(#[from] oximeter_db::Error), + + #[error(transparent)] + ResolveError(#[from] ResolveError), } // Messages for controlling a collection task @@ -231,8 +239,11 @@ async fn results_sink( /// Configuration for interacting with the metric database. #[derive(Debug, Clone, Copy, Deserialize, Serialize)] pub struct DbConfig { - /// Address of the ClickHouse server - pub address: SocketAddr, + /// Optional address of the ClickHouse server. + /// + /// If "None", will be inferred from DNS. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub address: Option, /// Batch size of samples at which to insert pub batch_size: usize, @@ -259,6 +270,7 @@ impl OximeterAgent { pub async fn with_id( id: Uuid, db_config: DbConfig, + resolver: &Resolver, log: &Logger, ) -> Result { let (result_sender, result_receiver) = mpsc::channel(8); @@ -267,7 +279,17 @@ impl OximeterAgent { // Construct the ClickHouse client first, propagate an error if we can't reach the // database. - let client = Client::new(db_config.address, &log); + let db_address = if let Some(address) = db_config.address { + address + } else { + SocketAddr::new( + resolver + .lookup_ip(SRV::Service(ServiceName::Clickhouse)) + .await?, + CLICKHOUSE_PORT, + ) + }; + let client = Client::new(db_address, &log); client.init_db().await?; // Spawn the task for aggregating and inserting all metrics @@ -334,18 +356,15 @@ impl OximeterAgent { /// Configuration used to initialize an oximeter server #[derive(Clone, Debug, Deserialize, Serialize)] pub struct Config { - /// An unique ID for this oximeter server - pub id: Uuid, - /// The address used to connect to Nexus. - pub nexus_address: SocketAddr, + /// + /// If "None", will be inferred from DNS. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub nexus_address: Option, /// Configuration for working with ClickHouse pub db: DbConfig, - /// The internal Dropshot HTTP server configuration - pub dropshot: ConfigDropshot, - /// Logging configuration pub log: ConfigLogging, } @@ -360,6 +379,11 @@ impl Config { } } +pub struct OximeterArguments { + pub id: Uuid, + pub address: SocketAddrV6, +} + /// A server used to collect metrics from components in the control plane. pub struct Oximeter { _agent: Arc, @@ -371,12 +395,15 @@ impl Oximeter { /// /// This starts an HTTP server used to communicate with other agents in Omicron, especially /// Nexus. It also registers itself as a new `oximeter` instance with Nexus. - pub async fn new(config: &Config) -> Result { + pub async fn new( + config: &Config, + args: &OximeterArguments, + ) -> Result { let log = config .log .to_logger("oximeter") .map_err(|msg| Error::Server(msg.to_string()))?; - Self::with_logger(config, log).await + Self::with_logger(config, args, log).await } /// Create a new `Oximeter`, specifying an alternative logger to use. @@ -385,6 +412,7 @@ impl Oximeter { /// `config`, using `log` instead. pub async fn with_logger( config: &Config, + args: &OximeterArguments, log: Logger, ) -> Result { let (drain, registration) = slog_dtrace::with_drain(log); @@ -398,10 +426,13 @@ impl Oximeter { } info!(log, "starting oximeter server"); + let resolver = Resolver::new_from_ip(*args.address.ip())?; + let make_agent = || async { debug!(log, "creating ClickHouse client"); Ok(Arc::new( - OximeterAgent::with_id(config.id, config.db, &log).await?, + OximeterAgent::with_id(args.id, config.db, &resolver, &log) + .await?, )) }; let log_client_failure = |error, delay| { @@ -421,7 +452,10 @@ impl Oximeter { let dropshot_log = log.new(o!("component" => "dropshot")); let server = HttpServerStarter::new( - &config.dropshot, + &ConfigDropshot { + bind_address: SocketAddr::V6(args.address), + ..Default::default() + }, oximeter_api(), Arc::clone(&agent), &dropshot_log, @@ -433,20 +467,33 @@ impl Oximeter { let client = reqwest::Client::new(); let notify_nexus = || async { debug!(log, "contacting nexus"); - client - .post(format!( - "http://{}/metrics/collectors", - config.nexus_address + let nexus_address = if let Some(address) = config.nexus_address { + address + } else { + SocketAddr::V6(SocketAddrV6::new( + resolver + .lookup_ipv6(SRV::Service(ServiceName::Nexus)) + .await + .map_err(|e| { + backoff::BackoffError::transient(e.to_string()) + })?, + NEXUS_INTERNAL_PORT, + 0, + 0, )) + }; + + client + .post(format!("http://{}/metrics/collectors", nexus_address,)) .json(&nexus_client::types::OximeterInfo { address: server.local_addr().to_string(), collector_id: agent.id, }) .send() .await - .map_err(backoff::BackoffError::transient)? + .map_err(|e| backoff::BackoffError::transient(e.to_string()))? .error_for_status() - .map_err(backoff::BackoffError::transient) + .map_err(|e| backoff::BackoffError::transient(e.to_string())) }; let log_notification_failure = |error, delay| { warn!( diff --git a/oximeter/collector/tests/output/cmd-oximeter-noargs-stderr b/oximeter/collector/tests/output/cmd-oximeter-noargs-stderr index 1398febf119..dfb062bca75 100644 --- a/oximeter/collector/tests/output/cmd-oximeter-noargs-stderr +++ b/oximeter/collector/tests/output/cmd-oximeter-noargs-stderr @@ -1,7 +1,13 @@ -error: The following required arguments were not provided: - +oximeter +See README.adoc for more information USAGE: - oximeter [OPTIONS] + oximeter -For more information try --help +OPTIONS: + -h, --help Print help information + +SUBCOMMANDS: + help Print this message or the help of the given subcommand(s) + openapi Print the external OpenAPI Spec document and exit + run Start an Oximeter server diff --git a/oximeter/collector/tests/test_commands.rs b/oximeter/collector/tests/test_commands.rs index 7b910a5be4a..d3d66be0580 100644 --- a/oximeter/collector/tests/test_commands.rs +++ b/oximeter/collector/tests/test_commands.rs @@ -50,7 +50,7 @@ fn test_oximeter_openapi() { // But we do know where it is at compile time, so we load it then. let config = include_str!("../../collector/config.toml"); let config_path = write_config(config); - let exec = Exec::cmd(path_to_oximeter()).arg(&config_path).arg("--openapi"); + let exec = Exec::cmd(path_to_oximeter()).arg("openapi"); let (exit_status, stdout_text, stderr_text) = run_command(exec); fs::remove_file(&config_path).expect("failed to remove temporary file"); assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index dde2ef47937..be11bfb2a6d 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -12,7 +12,7 @@ use crate::illumos::zone::AddressRequest; use crate::params::{ServiceEnsureBody, ServiceRequest, ServiceType}; use crate::zone::Zones; use dropshot::ConfigDropshot; -use omicron_common::address::{Ipv6Subnet, RACK_PREFIX}; +use omicron_common::address::{Ipv6Subnet, OXIMETER_PORT, RACK_PREFIX}; use omicron_common::nexus_config::{ self, DeploymentConfig as NexusDeploymentConfig, }; @@ -204,11 +204,11 @@ impl ServiceManager { existing_zones: &mut Vec, services: &Vec, ) -> Result<(), Error> { - info!(self.log, "Ensuring services are initialized: {:?}", services); // TODO(https://github.com/oxidecomputer/omicron/issues/726): // As long as we ensure the requests don't overlap, we could // parallelize this request. for service in services { + info!(self.log, "Ensuring service is initialized: {:?}", service); // Before we bother allocating anything for this request, check if // this service has already been created. let expected_zone_name = @@ -338,7 +338,7 @@ impl ServiceManager { database: nexus_config::Database::FromUrl { url: PostgresConfigWithUrl::from_str( "postgresql://root@[fd00:1122:3344:0101::2]:32221/omicron?sslmode=disable" - ).unwrap() + ).unwrap(), } }; @@ -434,8 +434,50 @@ impl ServiceManager { ServiceType::Oximeter => { info!(self.log, "Setting up oximeter service"); - // TODO: Implement with dynamic parameters, when address is - // dynamically assigned. + let address = service.addresses[0]; + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &smf_name, + "setprop", + &format!("config/id={}", service.id), + ]) + .map_err(|err| Error::ZoneCommand { + intent: "set server ID".to_string(), + err, + })?; + + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &smf_name, + "setprop", + &format!( + "config/address=[{}]:{}", + address, OXIMETER_PORT, + ), + ]) + .map_err(|err| Error::ZoneCommand { + intent: "set server address".to_string(), + err, + })?; + + running_zone + .run_cmd(&[ + crate::illumos::zone::SVCCFG, + "-s", + &default_smf_name, + "refresh", + ]) + .map_err(|err| Error::ZoneCommand { + intent: format!( + "Refresh SMF manifest {}", + default_smf_name + ), + err, + })?; } } @@ -494,7 +536,7 @@ impl ServiceManager { // that removal implicitly. warn!( self.log, - "Cannot request services on this sled, differing configurations: {:?}", + "Cannot request services on this sled, differing configurations: {:#?}", known_set.symmetric_difference(&requested_set) ); return Err(Error::ServicesAlreadyConfigured); diff --git a/smf/oximeter/config.toml b/smf/oximeter/config.toml index 4a0095fdd00..ca14fe6ec8b 100644 --- a/smf/oximeter/config.toml +++ b/smf/oximeter/config.toml @@ -1,11 +1,6 @@ # Example configuration file for running an oximeter collector server -id = "1da65e5b-210c-4859-a7d7-200c1e659972" -# Internal address of nexus -nexus_address = "[fd00:1122:3344:0101::3]:12221" - [db] -address = "[fd00:1122:3344:0101::5]:8123" batch_size = 1000 batch_interval = 5 # In seconds @@ -14,6 +9,3 @@ level = "debug" mode = "file" path = "/dev/stdout" if_exists = "append" - -[dropshot] -bind_address = "[fd00:1122:3344:0101::4]:12223" diff --git a/smf/oximeter/manifest.xml b/smf/oximeter/manifest.xml index 47e3cb254f1..d16efd90d99 100644 --- a/smf/oximeter/manifest.xml +++ b/smf/oximeter/manifest.xml @@ -18,10 +18,15 @@ + + + + +