Skip to content

Commit

Permalink
feat(metrics/net): add tap tx latency metric
Browse files Browse the repository at this point in the history
This metric measures how long the Firecracker VMM thread
is blocked on write syscalls when accessing a tap device.
By looking at it, we will be able to see what portion
of tx net latency is attributable to factors external
to Firecracker.

(cherry picked from commit f7d8a33)

Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
  • Loading branch information
kalyazin committed Mar 8, 2024
1 parent 8826f20 commit ae803c0
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/vmm/src/devices/virtio/net/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,7 @@ impl Net {
});
}

let _metric = net_metrics.tap_write_agg.record_latency_metrics();
match Self::write_tap(tap, frame_iovec) {
Ok(_) => {
let len = frame_iovec.len() as u64;
Expand Down
17 changes: 15 additions & 2 deletions src/vmm/src/devices/virtio/net/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ use std::sync::{Arc, RwLock};
use serde::ser::SerializeMap;
use serde::{Serialize, Serializer};

use crate::logger::{IncMetric, SharedIncMetric};
use crate::logger::{IncMetric, LatencyAggregateMetrics, SharedIncMetric};

/// map of network interface id and metrics
/// this should be protected by a lock before accessing.
Expand All @@ -107,7 +107,7 @@ impl NetMetricsPerDevice {
.write()
.unwrap()
.metrics
.insert(iface_id.clone(), Arc::new(NetDeviceMetrics::default()));
.insert(iface_id.clone(), Arc::new(NetDeviceMetrics::new()));
}
METRICS
.read()
Expand Down Expand Up @@ -184,6 +184,8 @@ pub struct NetDeviceMetrics {
pub tap_read_fails: SharedIncMetric,
/// Number of times writing to TAP failed.
pub tap_write_fails: SharedIncMetric,
/// Duration of all tap write operations.
pub tap_write_agg: LatencyAggregateMetrics,
/// Number of transmitted bytes.
pub tx_bytes_count: SharedIncMetric,
/// Number of malformed TX frames.
Expand All @@ -207,6 +209,14 @@ pub struct NetDeviceMetrics {
}

impl NetDeviceMetrics {
/// Creates a `NetDeviceMetrics` whose `tap_write_agg` is built with
/// `LatencyAggregateMetrics::new()`; every other field takes its
/// `Default` value.
///
/// Not a `const fn`: the remaining fields are filled in through
/// `Default::default()`.
pub fn new() -> Self {
Self {
// NOTE(review): presumably `LatencyAggregateMetrics::new()` seeds the
// aggregate differently from its `Default` impl — confirm.
tap_write_agg: LatencyAggregateMetrics::new(),
..Default::default()
}
}

/// Net metrics are SharedIncMetric where the diff of current vs
/// old is serialized i.e. serialize_u64(current-old).
/// So to have the aggregate serialized in same way we need to
Expand Down Expand Up @@ -239,6 +249,9 @@ impl NetDeviceMetrics {
self.rx_count.add(other.rx_count.fetch_diff());
self.tap_read_fails.add(other.tap_read_fails.fetch_diff());
self.tap_write_fails.add(other.tap_write_fails.fetch_diff());
self.tap_write_agg
.sum_us
.add(other.tap_write_agg.sum_us.fetch_diff());
self.tx_bytes_count.add(other.tx_bytes_count.fetch_diff());
self.tx_malformed_frames
.add(other.tx_malformed_frames.fetch_diff());
Expand Down
3 changes: 3 additions & 0 deletions tests/host_tools/fcmetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@ def validate_fc_metrics(metrics):
"read_agg",
"write_agg",
],
"net": [
"tap_write_agg",
],
}

# validate timestamp before jsonschema validation, which takes some more time
Expand Down

0 comments on commit ae803c0

Please sign in to comment.