Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1874521: connectivity check metrics cannot be aggregated across components #933

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 1 addition & 2 deletions pkg/cmd/checkendpoints/cmd.go
Expand Up @@ -3,7 +3,6 @@ package checkendpoints
import (
"context"
"os"
"strings"
"time"

operatorcontrolplaneclient "github.com/openshift/client-go/operatorcontrolplane/clientset/versioned"
Expand Down Expand Up @@ -31,7 +30,7 @@ func NewCheckEndpointsCommand() *cobra.Command {
kubeInformers.Core().V1().Secrets(),
cctx.EventRecorder,
)
controller.RegisterMetrics(strings.Replace(namespace, "-", "_", -1) + "_")
controller.RegisterMetrics()
operatorcontrolplaneInformers.Start(ctx.Done())
kubeInformers.Start(ctx.Done())
go check.Run(ctx, 1)
Expand Down
10 changes: 6 additions & 4 deletions pkg/cmd/checkendpoints/controller/connection_checker.go
Expand Up @@ -32,7 +32,7 @@ type ConnectionChecker interface {
type GetCheckFunc func() *operatorcontrolplanev1alpha1.PodNetworkConnectivityCheck

// NewConnectionChecker returns a ConnectionChecker.
func NewConnectionChecker(name, podName string, getCheck GetCheckFunc, client v1alpha1helpers.PodNetworkConnectivityCheckClient, clientCertGetter CertificatesGetter, recorder events.Recorder) ConnectionChecker {
func NewConnectionChecker(name, podName, podNamespace string, getCheck GetCheckFunc, client v1alpha1helpers.PodNetworkConnectivityCheckClient, clientCertGetter CertificatesGetter, recorder events.Recorder) ConnectionChecker {
return &connectionChecker{
name: name,
podName: podName,
Expand All @@ -42,6 +42,7 @@ func NewConnectionChecker(name, podName string, getCheck GetCheckFunc, client v1
recorder: recorder,
updates: NewUpdatesManager(checkPeriod, checkTimeout, newUpdatesProcessor(client, name)),
stop: make(chan interface{}),
metrics: NewMetricsContext(podNamespace, name),
}
}

Expand All @@ -64,6 +65,7 @@ type connectionChecker struct {
recorder events.Recorder
updates UpdatesManager
stop chan interface{}
metrics MetricsContext
}

// checkConnection checks the connection periodically, updating status as needed
Expand Down Expand Up @@ -149,7 +151,7 @@ func (c *connectionChecker) getTCPConnectLatency(ctx context.Context, address st
}
tcpConn, err := dialer.DialContext(ctx, "tcp", address)
if err != nil {
updateMetrics(address, latencyInfo, err)
c.metrics.Update(address, latencyInfo, err)
return latencyInfo, err
}

Expand All @@ -160,14 +162,14 @@ func (c *connectionChecker) getTCPConnectLatency(ctx context.Context, address st
// ignore any error. most likely non-tls connection, plus we're not really testing tls
klog.V(4).Infof("%s: tls error ignored: %v", address, err)
_ = tcpConn.Close()
updateMetrics(address, latencyInfo, nil)
c.metrics.Update(address, latencyInfo, nil)
return latencyInfo, nil
}

// gracefully close connection (ignore error)
_ = tlsConn.Close()

updateMetrics(address, latencyInfo, err)
c.metrics.Update(address, latencyInfo, err)
return latencyInfo, err
}

Expand Down
67 changes: 47 additions & 20 deletions pkg/cmd/checkendpoints/controller/metrics.go
Expand Up @@ -16,44 +16,63 @@ var (
dnsResolveLatencyGauge *metrics.GaugeVec
)

func RegisterMetrics(prefix string) {
// RegisterMetrics in the global registry
func RegisterMetrics() {
registerMetrics.Do(func() {
endpointCheckCounter = metrics.NewCounterVec(&metrics.CounterOpts{
Name: prefix + "endpoint_check_count",
Help: "Report status of endpoint checks for each pod over time.",
}, []string{"endpoint", "tcpConnect", "dnsResolve"})
Name: "pod_network_connectivity_check_count",
Help: "Report status of pod network connectivity checks over time.",
}, []string{"component", "checkName", "targetEndpoint", "tcpConnect", "dnsResolve"})

tcpConnectLatencyGauge = metrics.NewGaugeVec(&metrics.GaugeOpts{
Name: prefix + "endpoint_check_tcp_connect_latency_gauge",
Help: "Report latency of TCP connect to endpoint for each pod over time.",
}, []string{"endpoint"})
Name: "pod_network_connectivity_check_tcp_connect_latency_gauge",
Help: "Report latency of TCP connect to target endpoint over time.",
}, []string{"component", "checkName", "targetEndpoint"})

dnsResolveLatencyGauge = metrics.NewGaugeVec(&metrics.GaugeOpts{
Name: prefix + "endpoint_check_dns_resolve_latency_gauge",
Help: "Report latency of DNS resolve of endpoint for each pod over time.",
}, []string{"endpoint"})
Name: "pod_network_connectivity_check_dns_resolve_latency_gauge",
Help: "Report latency of DNS resolve of target endpoint over time.",
}, []string{"component", "checkName", "targetEndpoint"})
legacyregistry.MustRegister(endpointCheckCounter)
legacyregistry.MustRegister(tcpConnectLatencyGauge)
legacyregistry.MustRegister(dnsResolveLatencyGauge)
})
}

func updateMetrics(address string, latency *trace.LatencyInfo, checkErr error) {
endpointCheckCounter.With(getCounterMetricLabels(address, latency, checkErr)).Inc()
// MetricsContext updates connectivity check metrics
type MetricsContext interface {
Update(targetEndpoint string, latency *trace.LatencyInfo, checkErr error)
}

type metricsContext struct {
componentName string
checkName string
}

func NewMetricsContext(componentName, checkName string) *metricsContext {
RegisterMetrics()
return &metricsContext{
componentName: componentName,
checkName: checkName,
}

}

// Update the pod network connectivity check metrics for the given check results.
func (m *metricsContext) Update(targetEndpoint string, latency *trace.LatencyInfo, checkErr error) {
endpointCheckCounter.With(m.getCounterMetricLabels(targetEndpoint, latency, checkErr)).Inc()
if latency.Connect > 0 {
tcpConnectLatencyGauge.WithLabelValues(address).Set(float64(latency.Connect.Nanoseconds()))
tcpConnectLatencyGauge.With(m.getMetricLabels(targetEndpoint)).Set(float64(latency.Connect.Nanoseconds()))
}
if latency.DNS > 0 {
dnsResolveLatencyGauge.WithLabelValues(address).Set(float64(latency.DNS.Nanoseconds()))
dnsResolveLatencyGauge.With(m.getMetricLabels(targetEndpoint)).Set(float64(latency.DNS.Nanoseconds()))
}
}

func getCounterMetricLabels(address string, latency *trace.LatencyInfo, checkErr error) map[string]string {
labels := map[string]string{
"endpoint": address,
"dnsResolve": "",
"tcpConnect": "",
}
func (m *metricsContext) getCounterMetricLabels(targetEndpoint string, latency *trace.LatencyInfo, checkErr error) map[string]string {
labels := m.getMetricLabels(targetEndpoint)
labels["dnsResolve"] = ""
labels["tcpConnect"] = ""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can think about choosing a non-empty value to indicate the default/initial condition? In general, it's safer to avoid an empty string - no need to worry about how prometheus handles non existing and empty label.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels:

A label with an empty label value is considered equivalent to a label that does not exist.

if isDNSError(checkErr) {
labels["dnsResolve"] = "failure"
return labels
Expand All @@ -68,3 +87,11 @@ func getCounterMetricLabels(address string, latency *trace.LatencyInfo, checkErr
labels["tcpConnect"] = "success"
return labels
}

func (m *metricsContext) getMetricLabels(targetEndpoint string) map[string]string {
return map[string]string{
"component": m.componentName,
"checkName": m.checkName,
"targetEndpoint": targetEndpoint,
}
}
Expand Up @@ -83,7 +83,7 @@ func (c *controller) Sync(ctx context.Context, syncContext factory.SyncContext)
// create & start status updaters if needed
for _, check := range checks {
if updater := c.updaters[check.Name]; updater == nil {
c.updaters[check.Name] = NewConnectionChecker(check.Name, c.podName, c.newCheckFunc(check.Name), c, c.getClientCerts(check), c.recorder)
c.updaters[check.Name] = NewConnectionChecker(check.Name, c.podName, c.podNamespace, c.newCheckFunc(check.Name), c, c.getClientCerts(check), c.recorder)
go c.updaters[check.Name].Run(ctx)
}
}
Expand Down