Skip to content

Commit

Permalink
kubelet: Record a metric for latency of pod status update
Browse files Browse the repository at this point in the history
Track how long it takes for pod updates to propagate from detection
to successful change on API server. Will guide future improvements
in pod start and shutdown latency.
  • Loading branch information
smarterclayton committed Feb 3, 2022
1 parent 655bc69 commit b7d8ccc
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 3 deletions.
14 changes: 14 additions & 0 deletions pkg/kubelet/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
PodStartDurationKey = "pod_start_duration_seconds"
CgroupManagerOperationsKey = "cgroup_manager_duration_seconds"
PodWorkerStartDurationKey = "pod_worker_start_duration_seconds"
PodStatusSyncDurationKey = "pod_status_sync_duration_seconds"
PLEGRelistDurationKey = "pleg_relist_duration_seconds"
PLEGDiscardEventsKey = "pleg_discard_events"
PLEGRelistIntervalKey = "pleg_relist_interval_seconds"
Expand Down Expand Up @@ -168,6 +169,18 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// PodStatusSyncDuration is a Histogram that tracks the duration (in seconds) it takes from the time a pod
// status is generated to the time it is synced with the apiserver.
PodStatusSyncDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: PodStatusSyncDurationKey,
Help: "Duration in seconds to sync a pod status update. Measures time from detection to write.",
// Bucket boundaries, in seconds, ranging from 10ms to 60s.
Buckets: []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60},
StabilityLevel: metrics.ALPHA,
},
// Observations are partitioned by a single "priority" label.
[]string{"priority"},
)
// PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's
// Pod Lifecycle Event Generator (PLEG).
PLEGRelistDuration = metrics.NewHistogram(
Expand Down Expand Up @@ -534,6 +547,7 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(PodStartDuration)
legacyregistry.MustRegister(CgroupManagerDuration)
legacyregistry.MustRegister(PodWorkerStartDuration)
legacyregistry.MustRegister(PodStatusSyncDuration)
legacyregistry.MustRegister(ContainersPerPodCount)
legacyregistry.MustRegister(PLEGRelistDuration)
legacyregistry.MustRegister(PLEGDiscardEvents)
Expand Down
27 changes: 24 additions & 3 deletions pkg/kubelet/status/status_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"context"
"fmt"
"sort"
"strconv"
"strings"
"sync"
"time"
Expand All @@ -37,6 +38,7 @@ import (
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/metrics"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
statusutil "k8s.io/kubernetes/pkg/util/pod"
Expand All @@ -45,12 +47,15 @@ import (
// A wrapper around v1.PodStatus that includes a version to enforce that stale pod statuses are
// not sent to the API server.
type versionedPodStatus struct {
status v1.PodStatus
// Monotonically increasing version number (per pod).
// version is a monotonically increasing version number (per pod).
version uint64
// Pod name & namespace, for sending updates to API server.
podName string
podNamespace string
// at is the time at which the most recent status update was detected
at time.Time

status v1.PodStatus
}

type podStatusSyncRequest struct {
Expand Down Expand Up @@ -482,6 +487,13 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp
podName: pod.Name,
podNamespace: pod.Namespace,
}

if cachedStatus.at.IsZero() {
newStatus.at = time.Now()
} else {
newStatus.at = cachedStatus.at
}

m.podStatuses[pod.UID] = newStatus

select {
Expand Down Expand Up @@ -621,7 +633,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {

oldStatus := pod.Status.DeepCopy()
newPod, patchBytes, unchanged, err := statusutil.PatchPodStatus(m.kubeClient, pod.Namespace, pod.Name, pod.UID, *oldStatus, mergePodStatus(*oldStatus, status.status))
klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "patch", string(patchBytes))
klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "podUID", uid, "patch", string(patchBytes))

if err != nil {
klog.InfoS("Failed to update status for pod", "pod", klog.KObj(pod), "err", err)
Expand All @@ -634,6 +646,15 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
pod = newPod
}

// measure how long the status update took to propagate from generation to update on the server
var duration time.Duration
if status.at.IsZero() {
klog.V(3).InfoS("Pod had no status time set", "pod", klog.KObj(pod), "podUID", uid, "version", status.version)
} else {
duration = time.Now().Sub(status.at).Truncate(time.Millisecond)
}
metrics.PodStatusSyncDuration.WithLabelValues(strconv.Itoa(0)).Observe(duration.Seconds())

m.apiStatusVersions[kubetypes.MirrorPodUID(pod.UID)] = status.version

// We don't handle graceful deletion of mirror pods.
Expand Down

0 comments on commit b7d8ccc

Please sign in to comment.