Skip to content

Commit

Permalink
check-endpoints: refactor metric names and labels
Browse files Browse the repository at this point in the history
  • Loading branch information
sanchezl committed Aug 18, 2020
1 parent 57a1aa9 commit fc3699d
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 27 deletions.
3 changes: 1 addition & 2 deletions pkg/cmd/checkendpoints/cmd.go
Expand Up @@ -3,7 +3,6 @@ package checkendpoints
import (
"context"
"os"
"strings"
"time"

operatorcontrolplaneclient "github.com/openshift/client-go/operatorcontrolplane/clientset/versioned"
Expand Down Expand Up @@ -31,7 +30,7 @@ func NewCheckEndpointsCommand() *cobra.Command {
kubeInformers.Core().V1().Secrets(),
cctx.EventRecorder,
)
controller.RegisterMetrics(strings.Replace(namespace, "-", "_", -1) + "_")
controller.RegisterMetrics()
operatorcontrolplaneInformers.Start(ctx.Done())
kubeInformers.Start(ctx.Done())
go check.Run(ctx, 1)
Expand Down
10 changes: 6 additions & 4 deletions pkg/cmd/checkendpoints/controller/connection_checker.go
Expand Up @@ -32,7 +32,7 @@ type ConnectionChecker interface {
type GetCheckFunc func() *operatorcontrolplanev1alpha1.PodNetworkConnectivityCheck

// NewConnectionChecker returns a ConnectionChecker.
func NewConnectionChecker(name, podName string, getCheck GetCheckFunc, client v1alpha1helpers.PodNetworkConnectivityCheckClient, clientCertGetter CertificatesGetter, recorder events.Recorder) ConnectionChecker {
func NewConnectionChecker(name, podName, podNamespace string, getCheck GetCheckFunc, client v1alpha1helpers.PodNetworkConnectivityCheckClient, clientCertGetter CertificatesGetter, recorder events.Recorder) ConnectionChecker {
return &connectionChecker{
name: name,
podName: podName,
Expand All @@ -42,6 +42,7 @@ func NewConnectionChecker(name, podName string, getCheck GetCheckFunc, client v1
recorder: recorder,
updates: NewUpdatesManager(checkPeriod, checkTimeout, newUpdatesProcessor(client, name)),
stop: make(chan interface{}),
metrics: NewMetricsContext(podNamespace, name),
}
}

Expand All @@ -64,6 +65,7 @@ type connectionChecker struct {
recorder events.Recorder
updates UpdatesManager
stop chan interface{}
metrics MetricsContext
}

// checkConnection checks the connection periodically, updating status as needed
Expand Down Expand Up @@ -149,7 +151,7 @@ func (c *connectionChecker) getTCPConnectLatency(ctx context.Context, address st
}
tcpConn, err := dialer.DialContext(ctx, "tcp", address)
if err != nil {
updateMetrics(address, latencyInfo, err)
c.metrics.Update(address, latencyInfo, err)
return latencyInfo, err
}

Expand All @@ -160,14 +162,14 @@ func (c *connectionChecker) getTCPConnectLatency(ctx context.Context, address st
// ignore any error. most likely non-tls connection, plus we're not really testing tls
klog.V(4).Infof("%s: tls error ignored: %v", address, err)
_ = tcpConn.Close()
updateMetrics(address, latencyInfo, nil)
c.metrics.Update(address, latencyInfo, nil)
return latencyInfo, nil
}

// gracefully close connection (ignore error)
_ = tlsConn.Close()

updateMetrics(address, latencyInfo, err)
c.metrics.Update(address, latencyInfo, err)
return latencyInfo, err
}

Expand Down
67 changes: 47 additions & 20 deletions pkg/cmd/checkendpoints/controller/metrics.go
Expand Up @@ -16,44 +16,63 @@ var (
dnsResolveLatencyGauge *metrics.GaugeVec
)

func RegisterMetrics(prefix string) {
// RegisterMetrics in the global registry
func RegisterMetrics() {
registerMetrics.Do(func() {
endpointCheckCounter = metrics.NewCounterVec(&metrics.CounterOpts{
Name: prefix + "endpoint_check_count",
Help: "Report status of endpoint checks for each pod over time.",
}, []string{"endpoint", "tcpConnect", "dnsResolve"})
Name: "pod_network_connectivity_check_count",
Help: "Report status of pod network connectivity checks over time.",
}, []string{"component", "checkName", "targetEndpoint", "tcpConnect", "dnsResolve"})

tcpConnectLatencyGauge = metrics.NewGaugeVec(&metrics.GaugeOpts{
Name: prefix + "endpoint_check_tcp_connect_latency_gauge",
Help: "Report latency of TCP connect to endpoint for each pod over time.",
}, []string{"endpoint"})
Name: "pod_network_connectivity_check_tcp_connect_latency_gauge",
Help: "Report latency of TCP connect to target endpoint over time.",
}, []string{"component", "checkName", "targetEndpoint"})

dnsResolveLatencyGauge = metrics.NewGaugeVec(&metrics.GaugeOpts{
Name: prefix + "endpoint_check_dns_resolve_latency_gauge",
Help: "Report latency of DNS resolve of endpoint for each pod over time.",
}, []string{"endpoint"})
Name: "pod_network_connectivity_check_dns_resolve_latency_gauge",
Help: "Report latency of DNS resolve of target endpoint over time.",
}, []string{"component", "checkName", "endpoint"})
legacyregistry.MustRegister(endpointCheckCounter)
legacyregistry.MustRegister(tcpConnectLatencyGauge)
legacyregistry.MustRegister(dnsResolveLatencyGauge)
})
}

func updateMetrics(address string, latency *trace.LatencyInfo, checkErr error) {
endpointCheckCounter.With(getCounterMetricLabels(address, latency, checkErr)).Inc()
// MetricsContext updates connectivity check metrics
type MetricsContext interface {
Update(targetEndpoint string, latency *trace.LatencyInfo, checkErr error)
}

type metricsContext struct {
componentName string
checkName string
}

func NewMetricsContext(componentName, checkName string) *metricsContext {
RegisterMetrics()
return &metricsContext{
componentName: componentName,
checkName: checkName,
}

}

// Update the pod network connectivity check metrics for the given check results.
func (m *metricsContext) Update(targetEndpoint string, latency *trace.LatencyInfo, checkErr error) {
endpointCheckCounter.With(m.getCounterMetricLabels(targetEndpoint, latency, checkErr)).Inc()
if latency.Connect > 0 {
tcpConnectLatencyGauge.WithLabelValues(address).Set(float64(latency.Connect.Nanoseconds()))
tcpConnectLatencyGauge.With(m.getMetricLabels(targetEndpoint)).Set(float64(latency.Connect.Nanoseconds()))
}
if latency.DNS > 0 {
dnsResolveLatencyGauge.WithLabelValues(address).Set(float64(latency.DNS.Nanoseconds()))
dnsResolveLatencyGauge.With(m.getMetricLabels(targetEndpoint)).Set(float64(latency.DNS.Nanoseconds()))
}
}

func getCounterMetricLabels(address string, latency *trace.LatencyInfo, checkErr error) map[string]string {
labels := map[string]string{
"endpoint": address,
"dnsResolve": "",
"tcpConnect": "",
}
func (m *metricsContext) getCounterMetricLabels(targetEndpoint string, latency *trace.LatencyInfo, checkErr error) map[string]string {
labels := m.getMetricLabels(targetEndpoint)
labels["dnsResolve"] = ""
labels["tcpConnect"] = ""
if isDNSError(checkErr) {
labels["dnsResolve"] = "failure"
return labels
Expand All @@ -68,3 +87,11 @@ func getCounterMetricLabels(address string, latency *trace.LatencyInfo, checkErr
labels["tcpConnect"] = "success"
return labels
}

func (m *metricsContext) getMetricLabels(targetEndpoint string) map[string]string {
return map[string]string{
"component": m.componentName,
"checkName": m.checkName,
"targetEndpoint": targetEndpoint,
}
}
Expand Up @@ -83,7 +83,7 @@ func (c *controller) Sync(ctx context.Context, syncContext factory.SyncContext)
// create & start status updaters if needed
for _, check := range checks {
if updater := c.updaters[check.Name]; updater == nil {
c.updaters[check.Name] = NewConnectionChecker(check.Name, c.podName, c.newCheckFunc(check.Name), c, c.getClientCerts(check), c.recorder)
c.updaters[check.Name] = NewConnectionChecker(check.Name, c.podName, c.podNamespace, c.newCheckFunc(check.Name), c, c.getClientCerts(check), c.recorder)
go c.updaters[check.Name].Run(ctx)
}
}
Expand Down

0 comments on commit fc3699d

Please sign in to comment.