-
Notifications
You must be signed in to change notification settings - Fork 33
/
metrics.rs
324 lines (295 loc) · 10.7 KB
/
metrics.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//! Metrics produced by the sled-agent for collection by oximeter.
use omicron_common::api::internal::nexus::ProducerEndpoint;
use omicron_common::api::internal::nexus::ProducerKind;
use oximeter::types::MetricsError;
use oximeter::types::ProducerRegistry;
use oximeter_producer::LogConfig;
use oximeter_producer::Server as ProducerServer;
use sled_hardware_types::Baseboard;
use slog::Logger;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use uuid::Uuid;
cfg_if::cfg_if! {
if #[cfg(target_os = "illumos")] {
use oximeter_instruments::kstat::link;
use oximeter_instruments::kstat::CollectionDetails;
use oximeter_instruments::kstat::Error as KstatError;
use oximeter_instruments::kstat::KstatSampler;
use oximeter_instruments::kstat::TargetId;
use std::collections::BTreeMap;
use std::sync::Mutex;
} else {
use anyhow::anyhow;
}
}
/// The interval on which we ask `oximeter` to poll us for metric data.
///
/// This is supplied as the `interval` of the `ProducerEndpoint` used to start
/// the producer server.
pub(crate) const METRIC_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
/// The interval on which we sample link metrics.
//
// NOTE(review): this constant is not referenced elsewhere in this file;
// presumably callers pass it as the `interval` argument of the link-tracking
// methods -- confirm against the call sites.
pub(crate) const LINK_SAMPLE_INTERVAL: Duration = Duration::from_secs(10);
/// The maximum Dropshot request size for the metrics server, in bytes
/// (10 MiB).
const METRIC_REQUEST_MAX_SIZE: usize = 10 * 1024 * 1024;
/// An error during sled-agent metric production.
///
/// The payload of the `Kstat` variant depends on the target platform: on
/// illumos it wraps the kstat sampler's own error type, elsewhere it wraps an
/// `anyhow::Error`.
#[derive(Debug, thiserror::Error)]
pub enum Error {
/// A kstat sampler operation (creating the sampler, adding a target, or
/// removing a target) failed.
#[cfg(target_os = "illumos")]
#[error("Kstat-based metric failure")]
Kstat(#[source] KstatError),
/// Kstat-based metrics were requested on a platform other than illumos.
#[cfg(not(target_os = "illumos"))]
#[error("Kstat-based metric failure")]
Kstat(#[source] anyhow::Error),
/// Registering a metric producer with the producer registry failed.
#[error("Failed to insert metric producer into registry")]
Registry(#[source] MetricsError),
/// Fetching the hostname failed; carries the underlying I/O error.
#[error("Failed to fetch hostname")]
Hostname(#[source] std::io::Error),
/// The hostname could not be converted into a UTF-8 string.
#[error("Non-UTF8 hostname")]
NonUtf8Hostname,
/// The buffer holding the hostname contained no terminating NULL byte.
#[error("Missing NULL byte in hostname")]
HostnameMissingNull,
/// The oximeter producer server could not be started.
#[error("Failed to start metric producer server")]
ProducerServer(#[source] oximeter_producer::Error),
}
// Basic metadata about the sled agent used when publishing metrics.
//
// These values are copied into the link targets handed to the kstat sampler,
// which only happens on illumos; hence the dead-code allowance elsewhere.
#[derive(Clone, Debug)]
#[cfg_attr(not(target_os = "illumos"), allow(dead_code))]
struct SledIdentifiers {
// ID of this sled.
sled_id: Uuid,
// ID of the rack containing this sled.
rack_id: Uuid,
// Baseboard information, from which the reported serial number is taken.
baseboard: Baseboard,
}
/// Type managing all oximeter metrics produced by the sled-agent.
//
// TODO-completeness: We probably want to get kstats or other metrics in to this
// type from other parts of the code, possibly before the `SledAgent` itself
// exists. This is similar to the storage resources or other objects, most of
// which are essentially an `Arc<Mutex<Inner>>`. It would be nice to avoid that
// pattern, but until we have more statistics, it's not clear whether that's
// worth it right now.
#[derive(Clone)]
// NOTE: The ID fields aren't read on non-illumos systems. Rather than renaming
// fields that are simply not used yet, we suppress the dead-code lint on those
// platforms.
#[cfg_attr(not(target_os = "illumos"), allow(dead_code))]
pub struct MetricsManager {
// Identifying information attached to sled-specific metrics.
metadata: Arc<SledIdentifiers>,
// Logger handed to `new()`; stored but not otherwise used in this file.
_log: Logger,
// Sampler that collects kstat-based metrics; it is also registered as a
// producer with the producer server's registry.
#[cfg(target_os = "illumos")]
kstat_sampler: KstatSampler,
// Maps a link name to the ID the sampler assigned when the link was added,
// so that the link can later be removed by name.
//
// TODO-scalability: We may want to generalize this to store any kind of
// tracked target, and use a naming scheme that allows us pick out which
// target we're interested in from the arguments.
//
// For example, we can use the link name to do this, for any physical or
// virtual link, because they need to be unique. We could also do the same
// for disks or memory. If we wanted to guarantee uniqueness, we could
// namespace them internally, e.g., `"datalink:{link_name}"` would be the
// real key.
#[cfg(target_os = "illumos")]
tracked_links: Arc<Mutex<BTreeMap<String, TargetId>>>,
// The server from which `oximeter` collects our metric data.
producer_server: Arc<ProducerServer>,
}
impl MetricsManager {
    /// Create the manager for all metrics published by this sled agent.
    ///
    /// The sled ID, rack ID, and baseboard are retained as the identifying
    /// metadata attached to sled-specific metrics. A producer server is
    /// started using `sled_id` and `sled_address`; on illumos, a kstat
    /// sampler is also created and registered with that server's registry.
    ///
    /// # Errors
    ///
    /// Fails if the producer server cannot be started or, on illumos, if the
    /// kstat sampler cannot be created or registered as a producer.
    pub fn new(
        sled_id: Uuid,
        rack_id: Uuid,
        baseboard: Baseboard,
        sled_address: Ipv6Addr,
        log: Logger,
    ) -> Result<Self, Error> {
        let producer_server =
            start_producer_server(&log, sled_id, sled_address)?;

        // The kstat sampler and its bookkeeping only exist on illumos.
        #[cfg(target_os = "illumos")]
        let (kstat_sampler, tracked_links) = {
            let sampler = KstatSampler::new(&log).map_err(Error::Kstat)?;
            producer_server
                .registry()
                .register_producer(sampler.clone())
                .map_err(Error::Registry)?;
            (sampler, Arc::new(Mutex::new(BTreeMap::new())))
        };

        let metadata =
            Arc::new(SledIdentifiers { sled_id, rack_id, baseboard });
        Ok(Self {
            metadata,
            _log: log,
            #[cfg(target_os = "illumos")]
            kstat_sampler,
            #[cfg(target_os = "illumos")]
            tracked_links,
            producer_server,
        })
    }

    /// Borrow the registry owned by the producer server.
    pub fn registry(&self) -> &ProducerRegistry {
        self.producer_server.registry()
    }
}
/// Start a metric producer server.
fn start_producer_server(
log: &Logger,
sled_id: Uuid,
sled_address: Ipv6Addr,
) -> Result<Arc<ProducerServer>, Error> {
let log = log.new(slog::o!("component" => "producer-server"));
let registry = ProducerRegistry::with_id(sled_id);
// Listen on any available socket, using our underlay address.
let address = SocketAddr::new(sled_address.into(), 0);
// Resolve Nexus via DNS.
let registration_address = None;
let config = oximeter_producer::Config {
server_info: ProducerEndpoint {
id: registry.producer_id(),
kind: ProducerKind::SledAgent,
address,
base_route: String::new(), // Unused, will be removed.
interval: METRIC_COLLECTION_INTERVAL,
},
registration_address,
request_body_max_bytes: METRIC_REQUEST_MAX_SIZE,
log: LogConfig::Logger(log),
};
ProducerServer::start(&config).map(Arc::new).map_err(Error::ProducerServer)
}
#[cfg(target_os = "illumos")]
impl MetricsManager {
    /// Track metrics for a physical datalink.
    ///
    /// The link is described to the kstat sampler using this sled's rack ID,
    /// sled ID, serial number, and current hostname. `interval` is handed to
    /// the sampler through `CollectionDetails::never`. On success the
    /// sampler's ID for the link is remembered under `link_name`, so that
    /// `stop_tracking_link` can remove it later.
    ///
    /// # Errors
    ///
    /// Fails if the hostname cannot be determined, or if the sampler rejects
    /// the new target.
    pub async fn track_physical_link(
        &self,
        link_name: impl AsRef<str>,
        interval: Duration,
    ) -> Result<(), Error> {
        let hostname = hostname()?;
        let name = link_name.as_ref().to_string();
        let link = link::PhysicalDataLink {
            rack_id: self.metadata.rack_id,
            sled_id: self.metadata.sled_id,
            serial: self.serial_number(),
            hostname,
            link_name: name.clone(),
        };
        let details = CollectionDetails::never(interval);
        let id = self
            .kstat_sampler
            .add_target(link, details)
            .await
            .map_err(Error::Kstat)?;
        self.remember_link(name, id);
        Ok(())
    }

    /// Stop tracking metrics for a datalink.
    ///
    /// This works for both physical and virtual links. Asking to stop a link
    /// that is not currently tracked is not an error.
    ///
    /// # Errors
    ///
    /// Fails if the sampler cannot remove the target.
    #[allow(dead_code)]
    pub async fn stop_tracking_link(
        &self,
        link_name: impl AsRef<str>,
    ) -> Result<(), Error> {
        // Take the ID out of the map in its own statement, so the mutex guard
        // is released before we await on the sampler.
        let maybe_id =
            self.tracked_links.lock().unwrap().remove(link_name.as_ref());
        match maybe_id {
            Some(id) => self
                .kstat_sampler
                .remove_target(id)
                .await
                .map_err(Error::Kstat),
            None => Ok(()),
        }
    }

    /// Track metrics for a virtual datalink.
    ///
    /// Unlike physical links, the `hostname` recorded for the link is provided
    /// by the caller. On success the sampler's ID for the link is remembered
    /// under `link_name`, so that `stop_tracking_link` can remove it later.
    ///
    /// # Errors
    ///
    /// Fails if the sampler rejects the new target.
    #[allow(dead_code)]
    pub async fn track_virtual_link(
        &self,
        link_name: impl AsRef<str>,
        hostname: impl AsRef<str>,
        interval: Duration,
    ) -> Result<(), Error> {
        let name = link_name.as_ref().to_string();
        let link = link::VirtualDataLink {
            rack_id: self.metadata.rack_id,
            sled_id: self.metadata.sled_id,
            serial: self.serial_number(),
            hostname: hostname.as_ref().to_string(),
            link_name: name.clone(),
        };
        let details = CollectionDetails::never(interval);
        let id = self
            .kstat_sampler
            .add_target(link, details)
            .await
            .map_err(Error::Kstat)?;
        // Without recording the ID, `stop_tracking_link` could never find this
        // link again and would silently do nothing for virtual links.
        self.remember_link(name, id);
        Ok(())
    }

    // Record the sampler's ID for a link so it can later be removed by name.
    //
    // The mutex guard is a temporary that is dropped at the end of the
    // statement, so it is never held across an await point.
    fn remember_link(&self, link_name: String, id: TargetId) {
        self.tracked_links.lock().unwrap().insert(link_name, id);
    }

    // Return the serial number out of the baseboard, or the literal string
    // "unknown" if the baseboard itself is unknown.
    fn serial_number(&self) -> String {
        match &self.metadata.baseboard {
            Baseboard::Gimlet { identifier, .. }
            | Baseboard::Pc { identifier, .. } => identifier.clone(),
            Baseboard::Unknown => String::from("unknown"),
        }
    }
}
#[cfg(not(target_os = "illumos"))]
impl MetricsManager {
/// Track metrics for a physical datalink.
pub async fn track_physical_link(
&self,
_link_name: impl AsRef<str>,
_interval: Duration,
) -> Result<(), Error> {
Err(Error::Kstat(anyhow!(
"kstat metrics are not supported on this platform"
)))
}
/// Stop tracking metrics for a datalink.
///
/// This works for both physical and virtual links.
#[allow(dead_code)]
pub async fn stop_tracking_link(
&self,
_link_name: impl AsRef<str>,
) -> Result<(), Error> {
Err(Error::Kstat(anyhow!(
"kstat metrics are not supported on this platform"
)))
}
/// Track metrics for a virtual datalink.
#[allow(dead_code)]
pub async fn track_virtual_link(
&self,
_link_name: impl AsRef<str>,
_hostname: impl AsRef<str>,
_interval: Duration,
) -> Result<(), Error> {
Err(Error::Kstat(anyhow!(
"kstat metrics are not supported on this platform"
)))
}
}
/// Return the current hostname of this machine.
///
/// # Errors
///
/// - `Error::Hostname` if the `gethostname` call itself fails; the OS error
///   is preserved as the source.
/// - `Error::HostnameMissingNull` if the returned buffer has no NULL byte.
/// - `Error::NonUtf8Hostname` if the name is not valid UTF-8.
#[cfg(target_os = "illumos")]
fn hostname() -> Result<String, Error> {
    // See netdb.h
    const MAX_LEN: usize = 256;

    // The buffer is one byte longer than what we allow `gethostname` to
    // write, so its zeroed final byte acts as a terminator even if the name
    // fills all `MAX_LEN` bytes.
    let mut out = vec![0u8; MAX_LEN + 1];

    // SAFETY: `out` is a live, writable allocation of `MAX_LEN + 1` bytes, and
    // `gethostname` is told to write no more than `MAX_LEN` of them.
    let rc = unsafe {
        libc::gethostname(out.as_mut_ptr().cast::<libc::c_char>(), MAX_LEN)
    };
    if rc != 0 {
        // Report the failure as what it is, keeping the OS error for callers.
        return Err(Error::Hostname(std::io::Error::last_os_error()));
    }

    // The name is everything up to the first NULL byte. One should always be
    // present given the buffer sizing above, but you never know.
    let Some(len) = out.iter().position(|&byte| byte == 0) else {
        return Err(Error::HostnameMissingNull);
    };
    out.truncate(len);
    String::from_utf8(out).map_err(|_| Error::NonUtf8Hostname)
}