Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add performance metrics for initial sync and netpol #3450

Merged
merged 2 commits into from
Mar 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion contrib/kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ usage() {
echo "-ric | --run-in-container Configure the script to be run from a docker container, allowing it to still communicate with the kind controlplane"
echo "-ehp | --egress-ip-healthcheck-port TCP port used for gRPC session by egress IP node check. DEFAULT: 9107 (Use "0" for legacy dial to port 9)."
echo "-is | --ipsec Enable IPsec encryption (spawns ovn-ipsec pods)"
echo "-sm | --scale-metrics Enable scale metrics"
echo "--isolated Deploy with an isolated environment (no default gateway)"
echo "--delete Delete current cluster"
echo "--deploy Deploy ovn kubernetes without restarting kind"
Expand Down Expand Up @@ -295,6 +296,8 @@ parse_args() {
fi
OVN_EGRESSIP_HEALTHCHECK_PORT=$1
;;
-sm | --scale-metrics ) OVN_METRICS_SCALE_ENABLE=true
;;
--isolated ) OVN_ISOLATED=true
;;
-mne | --multi-network-enable ) shift
Expand Down Expand Up @@ -361,6 +364,7 @@ print_params() {
echo "OVN_EX_GW_NETWORK_INTERFACE = $OVN_EX_GW_NETWORK_INTERFACE"
echo "OVN_EGRESSIP_HEALTHCHECK_PORT = $OVN_EGRESSIP_HEALTHCHECK_PORT"
echo "OVN_DEPLOY_PODS = $OVN_DEPLOY_PODS"
echo "OVN_METRICS_SCALE_ENABLE = $OVN_METRICS_SCALE_ENABLE"
echo "OVN_ISOLATED = $OVN_ISOLATED"
echo "ENABLE_MULTI_NET = $ENABLE_MULTI_NET"
echo "OVN_SEPARATE_CLUSTER_MANAGER = $OVN_SEPARATE_CLUSTER_MANAGER"
Expand Down Expand Up @@ -510,6 +514,7 @@ set_default_params() {
OVN_EGRESSIP_HEALTHCHECK_PORT=${OVN_EGRESSIP_HEALTHCHECK_PORT:-9107}
OCI_BIN=${KIND_EXPERIMENTAL_PROVIDER:-docker}
OVN_DEPLOY_PODS=${OVN_DEPLOY_PODS:-"ovnkube-master ovnkube-node"}
OVN_METRICS_SCALE_ENABLE=${OVN_METRICS_SCALE_ENABLE:-false}
OVN_ISOLATED=${OVN_ISOLATED:-false}
OVN_GATEWAY_OPTS=""
if [ "$OVN_ISOLATED" == true ]; then
Expand Down Expand Up @@ -721,7 +726,8 @@ create_ovn_kube_manifests() {
--v4-join-subnet="${JOIN_SUBNET_IPV4}" \
--v6-join-subnet="${JOIN_SUBNET_IPV6}" \
--ex-gw-network-interface="${OVN_EX_GW_NETWORK_INTERFACE}" \
--multi-network-enable=${ENABLE_MULTI_NET}
--multi-network-enable="${ENABLE_MULTI_NET}" \
--ovnkube-metrics-scale-enable="${OVN_METRICS_SCALE_ENABLE}"
popd
}

Expand Down
7 changes: 7 additions & 0 deletions dist/images/daemonset.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ OVN_HOST_NETWORK_NAMESPACE=""
OVN_EX_GW_NETWORK_INTERFACE=""
OVNKUBE_NODE_MGMT_PORT_NETDEV=""
OVNKUBE_CONFIG_DURATION_ENABLE=
OVNKUBE_METRICS_SCALE_ENABLE=
# IN_UPGRADE is true only if called by upgrade-ovn.sh during the upgrade test,
# it will render only the parts in ovn-setup.yaml related to RBAC permissions.
IN_UPGRADE=
Expand Down Expand Up @@ -263,6 +264,9 @@ while [ "$1" != "" ]; do
--ovnkube-config-duration-enable)
OVNKUBE_CONFIG_DURATION_ENABLE=$VALUE
;;
--ovnkube-metrics-scale-enable)
OVNKUBE_METRICS_SCALE_ENABLE=$VALUE
;;
--in-upgrade)
IN_UPGRADE=true
;;
Expand Down Expand Up @@ -405,6 +409,8 @@ ovnkube_node_mgmt_port_netdev=${OVNKUBE_NODE_MGMT_PORT_NETDEV}
echo "ovnkube_node_mgmt_port_netdev: ${ovnkube_node_mgmt_port_netdev}"
ovnkube_config_duration_enable=${OVNKUBE_CONFIG_DURATION_ENABLE}
echo "ovnkube_config_duration_enable: ${ovnkube_config_duration_enable}"
ovnkube_metrics_scale_enable=${OVNKUBE_METRICS_SCALE_ENABLE}
echo "ovnkube_metrics_scale_enable: ${ovnkube_metrics_scale_enable}"

ovn_image=${image} \
ovn_image_pull_policy=${image_pull_policy} \
Expand Down Expand Up @@ -486,6 +492,7 @@ ovn_image=${image} \
ovnkube_logfile_maxbackups=${ovnkube_logfile_maxbackups} \
ovnkube_logfile_maxage=${ovnkube_logfile_maxage} \
ovnkube_config_duration_enable=${ovnkube_config_duration_enable} \
ovnkube_metrics_scale_enable=${ovnkube_metrics_scale_enable} \
ovn_acl_logging_rate_limit=${ovn_acl_logging_rate_limit} \
ovn_hybrid_overlay_net_cidr=${ovn_hybrid_overlay_net_cidr} \
ovn_hybrid_overlay_enable=${ovn_hybrid_overlay_enable} \
Expand Down
8 changes: 8 additions & 0 deletions dist/images/ovnkube.sh
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ ovnkube_node_mode=${OVNKUBE_NODE_MODE:-"full"}
# OVNKUBE_NODE_MGMT_PORT_NETDEV - is the net device to be used for management port
ovnkube_node_mgmt_port_netdev=${OVNKUBE_NODE_MGMT_PORT_NETDEV:-}
ovnkube_config_duration_enable=${OVNKUBE_CONFIG_DURATION_ENABLE:-false}
ovnkube_metrics_scale_enable=${OVNKUBE_METRICS_SCALE_ENABLE:-false}
# OVN_ENCAP_IP - encap IP to be used for OVN traffic on the node
ovn_encap_ip=${OVN_ENCAP_IP:-}

Expand Down Expand Up @@ -993,6 +994,12 @@ ovn-master() {
fi
echo "ovnkube_config_duration_enable_flag: ${ovnkube_config_duration_enable_flag}"

ovnkube_metrics_scale_enable_flag=
if [[ ${ovnkube_metrics_scale_enable} == "true" ]]; then
ovnkube_metrics_scale_enable_flag="--metrics-enable-scale"
fi
echo "ovnkube_metrics_scale_enable_flag: ${ovnkube_metrics_scale_enable_flag}"

echo "=============== ovn-master ========== MASTER ONLY"
/usr/bin/ovnkube \
--init-master ${K8S_NODE} \
Expand All @@ -1019,6 +1026,7 @@ ovn-master() {
${egressfirewall_enabled_flag} \
${egressqos_enabled_flag} \
${ovnkube_config_duration_enable_flag} \
${ovnkube_metrics_scale_enable_flag} \
${multi_network_enabled_flag} \
--metrics-bind-address ${ovnkube_master_metrics_bind_address} \
--host-network-namespace ${ovn_host_network_namespace} &
Expand Down
2 changes: 2 additions & 0 deletions dist/templates/ovnkube-master.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ spec:
value: "{{ ovnkube_logfile_maxage }}"
- name: OVNKUBE_CONFIG_DURATION_ENABLE
value: "{{ ovnkube_config_duration_enable }}"
- name: OVNKUBE_METRICS_SCALE_ENABLE
value: "{{ ovnkube_metrics_scale_enable }}"
- name: OVN_NET_CIDR
valueFrom:
configMapKeyRef:
Expand Down
10 changes: 5 additions & 5 deletions go-controller/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ type MetricsConfig struct {
NodeServerCert string `gcfg:"node-server-cert"`
// EnableConfigDuration holds the boolean flag to enable OVN-Kubernetes master to monitor OVN-Kubernetes master
// configuration duration and optionally, its application to all nodes
EnableConfigDuration bool `gcfg:"enable-config-duration"`
EnableEIPScaleMetrics bool `gcfg:"enable-eip-scale-metrics"`
EnableConfigDuration bool `gcfg:"enable-config-duration"`
EnableScaleMetrics bool `gcfg:"enable-scale-metrics"`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

flavio had added the previous EIP metrics flag:

    Add flags to explicitly enable the histogram metrics, since we only see value
    in having them when scale testing egress ips. The flag introduced here is:
        --metrics-enable-eip-scale
    
    Signed-off-by: Flavio Fernandes <flaviof@redhat.com>

@flavio-fernandes are you ok with changing this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@trozet yes. we can change it w/out any problems.

}

// OVNKubernetesFeatureConfig holds OVN-Kubernetes feature enhancement config file parameters and command-line overrides
Expand Down Expand Up @@ -1041,9 +1041,9 @@ var MetricsFlags = []cli.Flag{
Destination: &cliConfig.Metrics.EnableConfigDuration,
},
&cli.BoolFlag{
Name: "metrics-enable-eip-scale",
Usage: "Enables metrics related to Egress IP scaling",
Destination: &cliConfig.Metrics.EnableEIPScaleMetrics,
Name: "metrics-enable-scale",
Usage: "Enables metrics related to scaling",
Destination: &cliConfig.Metrics.EnableScaleMetrics,
},
}

Expand Down
6 changes: 3 additions & 3 deletions go-controller/pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ enable-pprof=true
node-server-privkey=/path/to/node-metrics-private.key
node-server-cert=/path/to/node-metrics.crt
enable-config-duration=true
enable-eip-scale-metrics=true
enable-scale-metrics=true

[logging]
loglevel=5
Expand Down Expand Up @@ -585,7 +585,7 @@ var _ = Describe("Config Operations", func() {
gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/path/to/node-metrics-private.key"))
gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/path/to/node-metrics.crt"))
gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true))
gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true))
gomega.Expect(Metrics.EnableScaleMetrics).To(gomega.Equal(true))

gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL))
gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/path/to/nb-client-private.key"))
Expand Down Expand Up @@ -673,7 +673,7 @@ var _ = Describe("Config Operations", func() {
gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/tls/nodeprivkey"))
gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/tls/nodecert"))
gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true))
gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true))
gomega.Expect(Metrics.EnableScaleMetrics).To(gomega.Equal(true))

gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL))
gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/client/privkey"))
Expand Down
105 changes: 104 additions & 1 deletion go-controller/pkg/metrics/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,17 @@ var MetricMasterReadyDuration = prometheus.NewGauge(prometheus.GaugeOpts{
Help: "The duration for the master to get to ready state",
})

// MetricMasterSyncDuration is a gauge vector reporting how long the initial
// Watch (sync plus handler setup) took for each kind of resource. The
// resource is identified by the "resource_name" label.
var MetricMasterSyncDuration = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "sync_duration_seconds",
		Help:      "The duration to sync and setup all handlers for a given resource",
	},
	[]string{"resource_name"},
)

// MetricMasterLeader identifies whether this instance of ovnkube-master is a leader or not
var MetricMasterLeader = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: MetricOvnkubeNamespace,
Expand Down Expand Up @@ -190,6 +201,66 @@ var metricEgressIPRebalanceCount = prometheus.NewCounter(prometheus.CounterOpts{
Help: "The total number of times assigned egress IP(s) needed to be moved to a different node"},
)

// metricNetpolEventLatency is a histogram of the time spent handling a whole
// network policy event. The "event" label carries the event type (the Help
// text lists create and delete).
// Buckets: 15 exponential buckets starting at 4ms, each twice the previous.
var metricNetpolEventLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "network_policy_event_latency_seconds",
		Help:      "The latency of full network policy event handling (create, delete)",
		Buckets:   prometheus.ExponentialBuckets(0.004, 2, 15),
	},
	[]string{"event"},
)

// metricNetpolLocalPodEventLatency is a histogram of the time spent handling
// network policy local pod events. The "event" label carries the event type
// (the Help text lists add and delete).
// Buckets: 15 exponential buckets starting at 2ms, each twice the previous.
var metricNetpolLocalPodEventLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "network_policy_local_pod_event_latency_seconds",
		Help:      "The latency of local pod events handling (add, delete)",
		Buckets:   prometheus.ExponentialBuckets(0.002, 2, 15),
	},
	[]string{"event"},
)

// metricNetpolPeerPodEventLatency is a histogram of the time spent handling
// network policy peer pod events. The "event" label carries the event type
// (the Help text lists add and delete).
// Buckets: 15 exponential buckets starting at 2ms, each twice the previous.
var metricNetpolPeerPodEventLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "network_policy_peer_pod_event_latency_seconds",
		Help:      "The latency of peer pod events handling (add, delete)",
		Buckets:   prometheus.ExponentialBuckets(0.002, 2, 15),
	},
	[]string{"event"},
)

// metricNetpolPeerNamespaceEventLatency is a histogram of the time spent
// handling network policy peer namespace events. The "event" label carries
// the event type (the Help text lists add and delete).
// Buckets: 15 exponential buckets starting at 2ms, each twice the previous.
var metricNetpolPeerNamespaceEventLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "network_policy_peer_namespace_event_latency_seconds",
		Help:      "The latency of peer namespace events handling (add, delete)",
		Buckets:   prometheus.ExponentialBuckets(0.002, 2, 15),
	},
	[]string{"event"},
)

// metricNetpolPeerNamespaceAndPodEventLatency is a histogram of the time spent
// handling network policy peer "namespace and pod" events. The "event" label
// carries the event type (add, delete).
// Buckets: 15 exponential buckets starting at 2ms, each twice the previous.
//
// The Help text previously duplicated the peer-namespace metric's description;
// it now names the namespace-and-pod events this metric is actually for, so
// the two series can be told apart in the exposition output.
var metricNetpolPeerNamespaceAndPodEventLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "network_policy_peer_namespace_and_pod_event_latency_seconds",
		Help:      "The latency of peer namespace and pod events handling (add, delete)",
		Buckets:   prometheus.ExponentialBuckets(.002, 2, 15),
	},
	[]string{
		"event",
	})

// metricPodEventLatency is a histogram of the time spent handling pod events.
// The "event" label carries the event type (the Help text lists add, update
// and delete).
// Buckets: 15 exponential buckets starting at 2ms, each twice the previous.
var metricPodEventLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemMaster,
		Name:      "pod_event_latency_seconds",
		Help:      "The latency of pod events handling (add, update, delete)",
		Buckets:   prometheus.ExponentialBuckets(0.002, 2, 15),
	},
	[]string{"event"},
)

var metricEgressFirewallRuleCount = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: MetricOvnkubeNamespace,
Subsystem: MetricOvnkubeSubsystemMaster,
Expand Down Expand Up @@ -288,6 +359,7 @@ const (
func RegisterMasterBase() {
prometheus.MustRegister(MetricMasterLeader)
prometheus.MustRegister(MetricMasterReadyDuration)
prometheus.MustRegister(MetricMasterSyncDuration)
prometheus.MustRegister(prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Namespace: MetricOvnkubeNamespace,
Expand Down Expand Up @@ -341,9 +413,16 @@ func RegisterMasterPerformance(nbClient libovsdbclient.Client) {
func RegisterMasterFunctional() {
// No need to unregister because process exits when leadership is lost.
prometheus.MustRegister(metricEgressIPCount)
if config.Metrics.EnableEIPScaleMetrics {
if config.Metrics.EnableScaleMetrics {
klog.Infof("Scale metrics are enabled")
prometheus.MustRegister(metricEgressIPAssignLatency)
prometheus.MustRegister(metricEgressIPUnassignLatency)
prometheus.MustRegister(metricNetpolEventLatency)
prometheus.MustRegister(metricNetpolLocalPodEventLatency)
prometheus.MustRegister(metricNetpolPeerPodEventLatency)
prometheus.MustRegister(metricNetpolPeerNamespaceEventLatency)
prometheus.MustRegister(metricNetpolPeerNamespaceAndPodEventLatency)
prometheus.MustRegister(metricPodEventLatency)
}
prometheus.MustRegister(metricEgressIPNodeUnreacheableCount)
prometheus.MustRegister(metricEgressIPRebalanceCount)
Expand Down Expand Up @@ -448,6 +527,30 @@ func RecordEgressIPRebalance(count int) {
metricEgressIPRebalanceCount.Add(float64(count))
}

// RecordNetpolEvent adds one observation to the network policy event latency
// histogram. eventName is used as the "event" label value and duration is
// recorded in seconds.
func RecordNetpolEvent(eventName string, duration time.Duration) {
	observer := metricNetpolEventLatency.WithLabelValues(eventName)
	observer.Observe(duration.Seconds())
}

// RecordNetpolLocalPodEvent adds one observation to the network policy local
// pod event latency histogram. eventName is used as the "event" label value
// and duration is recorded in seconds.
func RecordNetpolLocalPodEvent(eventName string, duration time.Duration) {
	observer := metricNetpolLocalPodEventLatency.WithLabelValues(eventName)
	observer.Observe(duration.Seconds())
}

// RecordNetpolPeerPodEvent adds one observation to the network policy peer pod
// event latency histogram. eventName is used as the "event" label value and
// duration is recorded in seconds.
func RecordNetpolPeerPodEvent(eventName string, duration time.Duration) {
	observer := metricNetpolPeerPodEventLatency.WithLabelValues(eventName)
	observer.Observe(duration.Seconds())
}

// RecordNetpolPeerNamespaceEvent adds one observation to the network policy
// peer namespace event latency histogram. eventName is used as the "event"
// label value and duration is recorded in seconds.
func RecordNetpolPeerNamespaceEvent(eventName string, duration time.Duration) {
	observer := metricNetpolPeerNamespaceEventLatency.WithLabelValues(eventName)
	observer.Observe(duration.Seconds())
}

// RecordNetpolPeerNamespaceAndPodEvent adds one observation to the network
// policy peer namespace-and-pod event latency histogram. eventName is used as
// the "event" label value and duration is recorded in seconds.
func RecordNetpolPeerNamespaceAndPodEvent(eventName string, duration time.Duration) {
	observer := metricNetpolPeerNamespaceAndPodEventLatency.WithLabelValues(eventName)
	observer.Observe(duration.Seconds())
}

// RecordPodEvent adds one observation to the pod event latency histogram.
// eventName is used as the "event" label value and duration is recorded in
// seconds.
func RecordPodEvent(eventName string, duration time.Duration) {
	observer := metricPodEventLatency.WithLabelValues(eventName)
	observer.Observe(duration.Seconds())
}

// UpdateEgressFirewallRuleCount records the number of Egress firewall rules.
func UpdateEgressFirewallRuleCount(count float64) {
metricEgressFirewallRuleCount.Add(count)
Expand Down