diff --git a/apis/hive/v1/metricsconfig/metrics_config.go b/apis/hive/v1/metricsconfig/metrics_config.go index 7a85e3e00d5..8477b03eb14 100644 --- a/apis/hive/v1/metricsconfig/metrics_config.go +++ b/apis/hive/v1/metricsconfig/metrics_config.go @@ -17,4 +17,5 @@ type MetricsConfig struct { // pkg/controller/metrics/metrics_with_dynamic_labels.go // +optional AdditionalClusterDeploymentLabels *map[string]string `json:"additionalClusterDeploymentLabels,omitempty"` + MetricsToReport []MetricsToReport `json:"metricsToReport,omitempty"` } diff --git a/apis/hive/v1/metricsconfig/metrics_to_report.go b/apis/hive/v1/metricsconfig/metrics_to_report.go new file mode 100644 index 00000000000..5aa2542766d --- /dev/null +++ b/apis/hive/v1/metricsconfig/metrics_to_report.go @@ -0,0 +1,15 @@ +package metricsconfig + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// MetricsToReport represents metrics that have additional customizations +type MetricsToReport struct { + // MetricNames is a list of metrics for which the following customizations must be added, if they support the customization + // The name of the metric here must be valid, and it can only be present once in metricsToReport. + MetricNames []string `json:"metricNames"` + // ClusterDeploymentLabelSelector can be used to match cluster deployment label present, it can be used to filter the metrics reported. + // It can only be used with metrics that have their clusterdeployment at hand when they are being reported. 
+ ClusterDeploymentLabelSelector metav1.LabelSelector `json:"clusterDeploymentLabelSelector"` +} diff --git a/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go b/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go index 2f657023dcb..f4ce4f77e23 100644 --- a/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go +++ b/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go @@ -30,6 +30,13 @@ func (in *MetricsConfig) DeepCopyInto(out *MetricsConfig) { } } } + if in.MetricsToReport != nil { + in, out := &in.MetricsToReport, &out.MetricsToReport + *out = make([]MetricsToReport, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } return } @@ -43,6 +50,28 @@ func (in *MetricsConfig) DeepCopy() *MetricsConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricsToReport) DeepCopyInto(out *MetricsToReport) { + *out = *in + if in.MetricNames != nil { + in, out := &in.MetricNames, &out.MetricNames + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.ClusterDeploymentLabelSelector.DeepCopyInto(&out.ClusterDeploymentLabelSelector) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricsToReport. +func (in *MetricsToReport) DeepCopy() *MetricsToReport { + if in == nil { + return nil + } + out := new(MetricsToReport) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *MetricsWithDuration) DeepCopyInto(out *MetricsWithDuration) { *out = *in diff --git a/config/crds/hive.openshift.io_hiveconfigs.yaml b/config/crds/hive.openshift.io_hiveconfigs.yaml index 143a35aa377..d206c07c82c 100644 --- a/config/crds/hive.openshift.io_hiveconfigs.yaml +++ b/config/crds/hive.openshift.io_hiveconfigs.yaml @@ -859,6 +859,72 @@ spec: Affected metrics are those whose type implements the metricsWithDynamicLabels interface found in pkg/controller/metrics/metrics_with_dynamic_labels.go type: object + metricsToReport: + items: + description: MetricsToReport represents metrics that have additional + customizations + properties: + clusterDeploymentLabelSelector: + description: |- + ClusterDeploymentLabelSelector can be used to match cluster deployment label present, it can be used to filter the metrics reported. + It can only be used with metrics that have their clusterdeployment at hand when they are being reported. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + metricNames: + description: "\tMetricNames is a list of metrics for which + the following customizations must be added, if they support + the customization\nThe name of the metric here must be + valid, and it can only be present once in metricsToReport." + items: + type: string + type: array + required: + - clusterDeploymentLabelSelector + - metricNames + type: object + type: array metricsWithDuration: description: Optional metrics and their configurations items: diff --git a/docs/hive_metrics.md b/docs/hive_metrics.md index 6eb508f18eb..7ef4894ce83 100644 --- a/docs/hive_metrics.md +++ b/docs/hive_metrics.md @@ -43,111 +43,148 @@ The hive operator will panic if provided additional labels overlap with the fixe Note: It is up to the cluster admins to be mindful of cardinality and ensure these labels are not too specific, like cluster id, otherwise it can negatively impact your observability system's performance +#### Cluster Deployment Label Selector + +Some metrics support the use of [LabelSelector](https://pkg.go.dev/k8s.io/apimachinery/pkg/apis/meta/v1#LabelSelector). If the metric matches the provided label selector, it will be reported, otherwise not. +The label query for the label selector is matched with that of the cluster deployment, so only those metrics which have a related cluster deployment available while observing the metric support this feature. 
+ +Opt into them via `HiveConfig.Spec.MetricsConfig.MetricsToReport.ClusterDeploymentLabelSelector`. + +Example: + +```yaml +hiveConfig: + spec: + metricsConfig: + metricsToReport: + - metricNames: + - hive_foo_histogram + - hive_foo_gauge + clusterDeploymentLabelSelector: + matchLabels: + hive.openshift.io/aro-snowflake: "true" + matchExpressions: + - key: hive.openshift.io/limited-support + operator: NotIn + values: + - "true" + - key: hive.openshift.io/limited-support + operator: DoesNotExist + - metricNames: + - hive_foo_counter + clusterDeploymentLabelSelector: + matchExpressions: + - key: hive.openshift.io/limited-support + operator: DoesNotExist +``` + +Please note: All metric names must be valid, must support this feature and there cannot be duplicate entries for a metric in `metricsToReport`. Please refer to the list of supported metrics and the fixed labels of each metric [here](#list-of-all-hive-metrics). + ### List of all Hive metrics #### Hive Operator metrics These metrics are observed by the Hive Operator. None of these are optional. -| Metric Name | Optional Label Support | Fixed Labels | -|:-------------------------------:|:----------------------:|---------------------------| -| hive_hiveconfig_conditions | N | {"condition", "reason"} | -| hive_operator_reconcile_seconds | N | {"controller", "outcome"} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:-------------------------------:|:----------------------:|:---------------------------------:|---------------------------| +| hive_hiveconfig_conditions | N | N | {"condition", "reason"} | +| hive_operator_reconcile_seconds | N | N | {"controller", "outcome"} | #### Metrics reported by all controllers These metrics are observed by all Hive Controllers. None of these are optional. 
-| Metric Name | Optional Label Support | Fixed Labels | -|:-----------------------------------------:|:----------------------:|----------------------------------------------------------| -| hive_kube_client_requests_total | N | {"controller", "method", "resource", "remote", "status"} | -| hive_kube_client_request_seconds | N | {"controller", "method", "resource", "remote", "status"} | -| hive_kube_client_requests_cancelled_total | N | {"controller", "method", "resource", "remote"} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:-----------------------------------------:|:----------------------:|:---------------------------------:|----------------------------------------------------------| +| hive_kube_client_requests_total | N | N | {"controller", "method", "resource", "remote", "status"} | +| hive_kube_client_request_seconds | N | N | {"controller", "method", "resource", "remote", "status"} | +| hive_kube_client_requests_cancelled_total | N | N | {"controller", "method", "resource", "remote"} | #### ClusterDeployment controller metrics These metrics are observed while processing ClusterDeployments. None of these are optional. 
-| Metric Name | Optional Label Support | Fixed Labels | -|:--------------------------------------------------------:|:----------------------:|--------------------------------------------------| -| hive_cluster_deployment_install_job_duration_seconds | N | {} | -| hive_cluster_deployment_install_job_delay_seconds | N | {} | -| hive_cluster_deployment_imageset_job_delay_seconds | N | {} | -| hive_cluster_deployment_dns_delay_seconds | N | {} | -| hive_cluster_deployment_completed_install_restart | Y | {} | -| hive_cluster_deployments_created_total | Y | {} | -| hive_cluster_deployments_installed_total | Y | {} | -| hive_cluster_deployments_deleted_total | Y | {} | -| hive_cluster_deployments_provision_failed_terminal_total | Y | {"clusterpool_namespacedname", "failure_reason"} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:--------------------------------------------------------:|:----------------------:|:---------------------------------:|--------------------------------------------------| +| hive_cluster_deployment_install_job_duration_seconds | N | Y | {} | +| hive_cluster_deployment_install_job_delay_seconds | N | Y | {} | +| hive_cluster_deployment_imageset_job_delay_seconds | N | Y | {} | +| hive_cluster_deployment_dns_delay_seconds | N | Y | {} | +| hive_cluster_deployment_completed_install_restart | Y | Y | {} | +| hive_cluster_deployments_created_total | Y | Y | {} | +| hive_cluster_deployments_installed_total | Y | Y | {} | +| hive_cluster_deployments_deleted_total | Y | Y | {} | +| hive_cluster_deployments_provision_failed_terminal_total | Y | Y | {"clusterpool_namespacedname", "failure_reason"} | #### ClusterProvision controller metrics These metrics are observed while processing ClusterProvisions. None of these are optional. 
-| Metric Name | Optional Label Support | Fixed Labels | -|:---------------------------------------------:|:----------------------:|-------------------------------------------------------------------------| -| hive_cluster_provision_results_total | Y | {"result"} | -| hive_install_errors | Y | {"reason"} | -| hive_cluster_deployment_install_failure_total | Y | {"platform", "region", "cluster_version", "workers", "install_attempt"} | -| hive_cluster_deployment_install_success_total | Y | {"platform", "region", "cluster_version", "workers", "install_attempt"} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:---------------------------------------------:|:----------------------:|:---------------------------------:|-------------------------------------------------------------------------| +| hive_cluster_provision_results_total | Y | N | {"result"} | +| hive_install_errors | Y | N | {"reason"} | +| hive_cluster_deployment_install_failure_total | Y | Y | {"platform", "region", "cluster_version", "workers", "install_attempt"} | +| hive_cluster_deployment_install_success_total | Y | Y | {"platform", "region", "cluster_version", "workers", "install_attempt"} | #### ClusterDeprovision controller metrics These metrics are observed while processing ClusterDeprovisions. None of these are optional. -| Metric Name | Optional Label Support | Fixed Labels | -|:------------------------------------------------------:|:----------------------:|--------------| -| hive_cluster_deployment_uninstall_job_duration_seconds | N | {} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:------------------------------------------------------:|:----------------------:|:---------------------------------:|--------------| +| hive_cluster_deployment_uninstall_job_duration_seconds | N | Y | {} | #### ClusterPool controller metrics These metrics are observed while processing ClusterPools. 
None of these are optional. -| Metric Name | Optional Label Support | Fixed Labels | -|:-------------------------------------------------:|:----------------------:|-----------------------------------------------| -| hive_clusterpool_clusterdeployments_assignable | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_claimed | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_deleting | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_installing | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_unclaimed | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_standby | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_stale | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_clusterdeployments_broken | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterpool_stale_clusterdeployments_deleted | N | {"clusterpool_namespace", "clusterpool_name"} | -| hive_clusterclaim_assignment_delay_seconds | N | {"clusterpool_namespace", "clusterpool_name"} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:-------------------------------------------------:|:----------------------:|:---------------------------------:|-----------------------------------------------| +| hive_clusterpool_clusterdeployments_assignable | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_claimed | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_deleting | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_installing | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_unclaimed | N | N | 
{"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_standby | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_stale | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_clusterdeployments_broken | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterpool_stale_clusterdeployments_deleted | N | N | {"clusterpool_namespace", "clusterpool_name"} | +| hive_clusterclaim_assignment_delay_seconds | N | N | {"clusterpool_namespace", "clusterpool_name"} | #### Metrics controller metrics These metrics are accumulated across all instance of that type. Some of these metrics are optional and the admin can opt for logging them via `HiveConfig.Spec.MetricsConfig.MetricsWithDuration` -| Metric Name | Optional Label Support | Optional | Fixed Labels | -|:--------------------------------------------------------------:|:----------------------:|:--------:|-----------------------------------------------------------------------------------------------------------------| -| hive_cluster_deployments | N | N | {"cluster_type", "age_lt", "power_state"} | -| hive_cluster_deployments_installed | N | N | {"cluster_type", "age_lt"} | -| hive_cluster_deployments_uninstalled | N | N | {"cluster_type", "age_lt", "uninstalled_gt"} | -| hive_cluster_deployments_deprovisioning | N | N | {"cluster_type", "age_lt", "deprovisioning_gt"} | -| hive_cluster_deployments_conditions | N | N | {"cluster_type", "age_lt", "condition"} | -| hive_install_jobs | N | N | {"cluster_type", "state"} | -| hive_uninstall_jobs | N | N | {"cluster_type", "state"} | -| hive_imageset_jobs | N | N | {"cluster_type", "state"} | -| hive_selectorsyncset_clusters_total | N | N | {"name"} | -| hive_selectorsyncset_clusters_unapplied_total | N | N | {"name"} | -| hive_syncsets_total | N | N | {} | -| hive_syncsets_unapplied_total | N | N | {} | -| hive_cluster_deployment_deprovision_underway_seconds | N | N | 
{"cluster_deployment", "namespace", "cluster_type"} | -| hive_clustersync_failing_seconds | Y | Y | {"namespaced_name", "unreachable"} | -| hive_cluster_deployments_hibernation_transition_seconds | N | Y | {"cluster_version", "platform", "cluster_pool_namespace", "cluster_pool_name"} | -| hive_cluster_deployments_running_transition_seconds | N | Y | {"cluster_version", "platform", "cluster_pool_namespace", "cluster_pool_name"} | -| hive_cluster_deployments_stopping_seconds | N | Y | {"cluster_deployment_namespace", "cluster_deployment", "platform", "cluster_version", "cluster_pool_namespace"} | -| hive_cluster_deployments_resuming_seconds | N | Y | {"cluster_deployment_namespace", "cluster_deployment", "platform", "cluster_version", "cluster_pool_namespace"} | -| hive_cluster_deployments_waiting_for_cluster_operators_seconds | N | Y | {"cluster_deployment_namespace", "cluster_deployment", "platform", "cluster_version", "cluster_pool_namespace"} | -| hive_controller_reconcile_seconds | N | N | {"controller", "outcome"} | -| hive_cluster_deployment_syncset_paused | N | N | {"cluster_deployment", "namespace", "cluster_type"} | -| hive_cluster_deployment_provision_underway_seconds | N | N | {"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"} | -| hive_cluster_deployment_provision_underway_install_restarts | N | N | {"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"} | +| Metric Name | Optional Label Support | Optional | Cluster Deployment Label Selector | Fixed Labels | +|:--------------------------------------------------------------:|:----------------------:|:--------:|:---------------------------------:|-----------------------------------------------------------------------------------------------------------------| +| hive_cluster_deployments | N | N | Y | {"cluster_type", "age_lt", "power_state"} | +| hive_cluster_deployments_installed | N | N | Y | {"cluster_type", 
"age_lt"} | +| hive_cluster_deployments_uninstalled | N | N | Y | {"cluster_type", "age_lt", "uninstalled_gt"} | +| hive_cluster_deployments_deprovisioning | N | N | Y | {"cluster_type", "age_lt", "deprovisioning_gt"} | +| hive_cluster_deployments_conditions | N | N | Y | {"cluster_type", "age_lt", "condition"} | +| hive_install_jobs | N | N | N | {"cluster_type", "state"} | +| hive_uninstall_jobs | N | N | N | {"cluster_type", "state"} | +| hive_imageset_jobs | N | N | N | {"cluster_type", "state"} | +| hive_selectorsyncset_clusters_total | N | N | N | {"name"} | +| hive_selectorsyncset_clusters_unapplied_total | N | N | N | {"name"} | +| hive_syncsets_total | N | N | N | {} | +| hive_syncsets_unapplied_total | N | N | N | {} | +| hive_cluster_deployment_deprovision_underway_seconds | N | N | Y | {"cluster_deployment", "namespace", "cluster_type"} | +| hive_clustersync_failing_seconds | Y | Y | Y | {"namespaced_name", "unreachable"} | +| hive_cluster_deployments_hibernation_transition_seconds | N | Y | Y | {"cluster_version", "platform", "cluster_pool_namespace", "cluster_pool_name"} | +| hive_cluster_deployments_running_transition_seconds | N | Y | Y | {"cluster_version", "platform", "cluster_pool_namespace", "cluster_pool_name"} | +| hive_cluster_deployments_stopping_seconds | N | Y | Y | {"cluster_deployment_namespace", "cluster_deployment", "platform", "cluster_version", "cluster_pool_namespace"} | +| hive_cluster_deployments_resuming_seconds | N | Y | Y | {"cluster_deployment_namespace", "cluster_deployment", "platform", "cluster_version", "cluster_pool_namespace"} | +| hive_cluster_deployments_waiting_for_cluster_operators_seconds | N | Y | Y | {"cluster_deployment_namespace", "cluster_deployment", "platform", "cluster_version", "cluster_pool_namespace"} | +| hive_controller_reconcile_seconds | N | N | N | {"controller", "outcome"} | +| hive_cluster_deployment_syncset_paused | N | N | Y | {"cluster_deployment", "namespace", "cluster_type"} | +| 
hive_cluster_deployment_provision_underway_seconds | N | N | Y | {"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"} | +| hive_cluster_deployment_provision_underway_install_restarts | N | N | Y | {"cluster_deployment", "namespace", "cluster_type", "condition", "reason", "platform", "image_set"} | ### Managed DNS Metrics These are specific to the [Managed DNS flow](using-hive.md#managed-dns-1), and are probably interesting only to developers. Not optional. -| Metric Name | Optional Label Support | Fixed Labels | -|:-----------------------------------:|:----------------------:|--------------------| -| hive_managed_dns_scrape_seconds | N | {"managed_domain"} | -| hive_managed_dns_subdomains_scraped | N | {"managed_domain"} | +| Metric Name | Optional Label Support | Cluster Deployment Label Selector | Fixed Labels | +|:-----------------------------------:|:----------------------:|:---------------------------------:|--------------------| +| hive_managed_dns_scrape_seconds | N | N | {"managed_domain"} | +| hive_managed_dns_subdomains_scraped | N | N | {"managed_domain"} | ### Example: Configure metricsConfig diff --git a/hack/app-sre/saas-template.yaml b/hack/app-sre/saas-template.yaml index 05cf62e9e33..ecab4eae301 100644 --- a/hack/app-sre/saas-template.yaml +++ b/hack/app-sre/saas-template.yaml @@ -6466,6 +6466,89 @@ objects: pkg/controller/metrics/metrics_with_dynamic_labels.go' type: object + metricsToReport: + items: + description: MetricsToReport represents metrics that have + additional customizations + properties: + clusterDeploymentLabelSelector: + description: 'ClusterDeploymentLabelSelector can be used + to match cluster deployment label present, it can be + used to filter the metrics reported. + + It can only be used with metrics that have their clusterdeployment + at hand when they are being reported.' 
+ properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: 'A label selector requirement is a + selector that contains values, a key, and an operator + that + + relates the key and values.' + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: 'operator represents a key''s relationship + to a set of values. + + Valid operators are In, NotIn, Exists and + DoesNotExist.' + type: string + values: + description: 'values is an array of string values. + If the operator is In or NotIn, + + the values array must be non-empty. If the + operator is Exists or DoesNotExist, + + the values array must be empty. This array + is replaced during a strategic + + merge patch.' + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: 'matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + + map is equivalent to an element of matchExpressions, + whose key field is "key", the + + operator is "In", and the values array contains + only "value". The requirements are ANDed.' + type: object + type: object + x-kubernetes-map-type: atomic + metricNames: + description: "\tMetricNames is a list of metrics for which\ + \ the following customizations must be added, if they\ + \ support the customization\nThe name of the metric\ + \ here must be valid, and it can only be present once\ + \ in metricsToReport." 
+ items: + type: string + type: array + required: + - clusterDeploymentLabelSelector + - metricNames + type: object + type: array metricsWithDuration: description: Optional metrics and their configurations items: diff --git a/pkg/controller/clusterdeployment/clusterdeployment_controller.go b/pkg/controller/clusterdeployment/clusterdeployment_controller.go index be2d5b2e0cf..651301570ff 100644 --- a/pkg/controller/clusterdeployment/clusterdeployment_controller.go +++ b/pkg/controller/clusterdeployment/clusterdeployment_controller.go @@ -123,15 +123,6 @@ func Add(mgr manager.Manager) error { logger.WithError(err).Error("could not get controller configurations") return err } - // Read the metrics config from hive config and set values for mapClusterTypeLabelToValue, if present - mConfig, err := hivemetrics.ReadMetricsConfig() - if err != nil { - log.WithError(err).Error("error reading metrics config") - return err - } - // Register the metrics. This is done here to ensure we define the metrics with optional label support after we have - // read the hiveconfig, and we register them only once. 
- registerMetrics(mConfig) return AddToManager(mgr, NewReconciler(mgr, logger, clientRateLimiter), concurrentReconciles, queueRateLimiter) } @@ -609,7 +600,9 @@ func (r *ReconcileClusterDeployment) reconcile(request reconcile.Request, cd *hi cdLog.WithError(err).Log(controllerutils.LogLevel(err), "error adding finalizer") return reconcile.Result{}, err } - metricClustersCreated.Observe(cd, nil, 1) + if hivemetrics.ShouldLogCounterOpts(hivemetrics.MetricClustersCreated.CounterOpts, cd, cdLog) { + hivemetrics.MetricClustersCreated.Observe(cd, nil, 1) + } return reconcile.Result{}, nil } @@ -1070,7 +1063,9 @@ func (r *ReconcileClusterDeployment) resolveInstallerImage(cd *hivev1.ClusterDep // kickstartDuration calculates the delay between creation of cd and start of imageset job kickstartDuration := time.Since(cd.CreationTimestamp.Time) cdLog.WithField("elapsed", kickstartDuration.Seconds()).Info("calculated time to imageset job seconds") - metricImageSetDelaySeconds.Observe(float64(kickstartDuration.Seconds())) + if hivemetrics.ShouldLogHistogramVec(hivemetrics.MetricImageSetDelaySeconds, cd, cdLog) { + hivemetrics.MetricImageSetDelaySeconds.WithLabelValues().Observe(float64(kickstartDuration.Seconds())) + } return &reconcile.Result{}, nil // There was an error getting the job. Return the error. 
@@ -1507,8 +1502,10 @@ func (r *ReconcileClusterDeployment) removeClusterDeploymentFinalizer(cd *hivev1 return err } - // Increment the clusters deleted counter: - metricClustersDeleted.Observe(cd, nil, 1) + if hivemetrics.ShouldLogCounterOpts(hivemetrics.MetricClustersDeleted.CounterOpts, cd, r.logger) { + // Increment the clusters deleted counter: + hivemetrics.MetricClustersDeleted.Observe(cd, nil, 1) + } return nil } @@ -1582,7 +1579,9 @@ func (r *ReconcileClusterDeployment) setDNSDelayMetric(cd *hivev1.ClusterDeploym return false, err } - metricDNSDelaySeconds.Observe(float64(dnsDelayDuration.Seconds())) + if hivemetrics.ShouldLogHistogramVec(hivemetrics.MetricDNSDelaySeconds, cd, cdLog) { + hivemetrics.MetricDNSDelaySeconds.WithLabelValues().Observe(float64(dnsDelayDuration.Seconds())) + } return true, nil } @@ -1625,7 +1624,7 @@ func (r *ReconcileClusterDeployment) ensureDNSZonePreserveOnDeleteAndLogAnnotati // ensureManagedDNSZone // - Makes sure a DNSZone object exists for this CD, creating it if it does not already exist. // - Parlays the DNSZone's status conditions into the CD's DNSZoneNotReady condition. -// - Observes metricDNSDelaySeconds. +// - Observes MetricDNSDelaySeconds. // Returns: // - bool: true if the caller should return from the reconcile loop, using the... // - Result: suitable for returning from the reconcile loop. If the DNSZone is not ready, it will include a delay to retrigger once @@ -1753,7 +1752,7 @@ func (r *ReconcileClusterDeployment) ensureManagedDNSZone(cd *hivev1.ClusterDepl // Observe ProvisionFailedTerminal metric if we have set ProvisionStopped. 
if controllerutils.FindCondition(cd.Status.Conditions, hivev1.ProvisionStoppedCondition).Status == corev1.ConditionTrue && provisionStoppedConditionStatus != corev1.ConditionTrue { - incProvisionFailedTerminal(cd) + incProvisionFailedTerminal(cd, cdLog) } // Only attempt to record the delay metric if the DNSZone is ready @@ -2668,3 +2667,22 @@ func LoadReleaseImageVerifier(config *rest.Config) (verify.Interface, error) { } return verify.NewFromManifests([]manifest.Manifest{m}, sigstore.NewCachedHTTPClientConstructor(sigstore.DefaultClient, nil).HTTPClient) } + +func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment, log log.FieldLogger) { + poolNSName := "" + if poolRef := cd.Spec.ClusterPoolRef; poolRef != nil { + poolNSName = poolRef.Namespace + "/" + poolRef.PoolName + } + stoppedReason := "unknown" + stoppedCondition := controllerutils.FindCondition(cd.Status.Conditions, hivev1.ProvisionStoppedCondition) + if stoppedCondition != nil { + stoppedReason = stoppedCondition.Reason + } + fixedLabels := map[string]string{ + "clusterpool_namespacedname": poolNSName, + "failure_reason": stoppedReason, + } + if hivemetrics.ShouldLogCounterOpts(hivemetrics.MetricProvisionFailedTerminal.CounterOpts, cd, log) { + hivemetrics.MetricProvisionFailedTerminal.Observe(cd, fixedLabels, 1) + } +} diff --git a/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go b/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go index 4f650a0d89c..1a6b8845aa8 100644 --- a/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go +++ b/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go @@ -19,7 +19,6 @@ import ( "github.com/openshift/hive/apis/hive/v1/azure" "github.com/openshift/hive/apis/hive/v1/baremetal" "github.com/openshift/hive/apis/hive/v1/gcp" - "github.com/openshift/hive/apis/hive/v1/metricsconfig" hiveintv1alpha1 "github.com/openshift/hive/apis/hiveinternal/v1alpha1" "github.com/openshift/hive/pkg/constants" 
controllerutils "github.com/openshift/hive/pkg/controller/utils" @@ -95,7 +94,6 @@ func init() { // While the metrics need not be registered for this test suite, they still need to be defined to avoid panics // during the tests var _ log.FieldLogger = log.WithField("controller", "clusterDeployment") - registerMetrics(&metricsconfig.MetricsConfig{}) } func fakeReadFile(content string) func(string) ([]byte, error) { diff --git a/pkg/controller/clusterdeployment/clusterinstalls.go b/pkg/controller/clusterdeployment/clusterinstalls.go index 9c4aa674782..6645cf627fd 100644 --- a/pkg/controller/clusterdeployment/clusterinstalls.go +++ b/pkg/controller/clusterdeployment/clusterinstalls.go @@ -20,6 +20,7 @@ import ( hivev1 "github.com/openshift/hive/apis/hive/v1" hivecontractsv1alpha1 "github.com/openshift/hive/apis/hivecontracts/v1alpha1" "github.com/openshift/hive/pkg/constants" + "github.com/openshift/hive/pkg/controller/metrics" controllerutils "github.com/openshift/hive/pkg/controller/utils" ) @@ -123,7 +124,9 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c kickstartDuration := time.Since(ci.CreationTimestamp.Time) logger.WithField("elapsed", kickstartDuration.Seconds()).Info("calculated time to first provision seconds") - metricInstallDelaySeconds.Observe(float64(kickstartDuration.Seconds())) + if metrics.ShouldLogHistogramVec(metrics.MetricInstallDelaySeconds, cd, logger) { + metrics.MetricInstallDelaySeconds.WithLabelValues().Observe(float64(kickstartDuration.Seconds())) + } } } @@ -140,7 +143,7 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c msg = "Install attempts limit reached" } - // Fun extra variable to keep track of whether we should increment metricProvisionFailedTerminal + // Fun extra variable to keep track of whether we should increment MetricProvisionFailedTerminal // later; because we only want to do that if (we change that status and) the status update succeeds. 
provisionFailedTerminal := false conditions, updated = controllerutils.SetClusterDeploymentConditionWithChangeCheck(conditions, @@ -217,11 +220,17 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c } installDuration := cd.Status.InstalledTimestamp.Sub(installStartTime.Time) logger.WithField("duration", installDuration.Seconds()).Debug("install job completed") - metricInstallJobDuration.Observe(float64(installDuration.Seconds())) + if metrics.ShouldLogHistogramVec(metrics.MetricInstallJobDuration, cd, logger) { + metrics.MetricInstallJobDuration.WithLabelValues().Observe(float64(installDuration.Seconds())) + } - metricCompletedInstallJobRestarts.Observe(cd, nil, float64(cd.Status.InstallRestarts)) + if metrics.ShouldLogHistogramOpts(metrics.MetricCompletedInstallJobRestarts.HistogramOpts, cd, logger) { + metrics.MetricCompletedInstallJobRestarts.Observe(cd, nil, float64(cd.Status.InstallRestarts)) + } - metricClustersInstalled.Observe(cd, nil, 1) + if metrics.ShouldLogCounterOpts(metrics.MetricClustersInstalled.CounterOpts, cd, logger) { + metrics.MetricClustersInstalled.Observe(cd, nil, 1) + } if r.protectedDelete { // Set protected delete on for the ClusterDeployment. @@ -283,7 +292,7 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c cd.Spec = *specSave // If we declared the provision terminally failed, bump our metric if provisionFailedTerminal { - incProvisionFailedTerminal(cd) + incProvisionFailedTerminal(cd, logger) } } // Do the spec update after the status update. 
Otherwise, if the former succeeded but the diff --git a/pkg/controller/clusterdeployment/clusterprovisions.go b/pkg/controller/clusterdeployment/clusterprovisions.go index 6b65bcf7e19..d661561a161 100644 --- a/pkg/controller/clusterdeployment/clusterprovisions.go +++ b/pkg/controller/clusterdeployment/clusterprovisions.go @@ -33,6 +33,7 @@ import ( "github.com/openshift/hive/apis/hive/v1/azure" "github.com/openshift/hive/apis/hive/v1/gcp" "github.com/openshift/hive/pkg/constants" + "github.com/openshift/hive/pkg/controller/metrics" controllerutils "github.com/openshift/hive/pkg/controller/utils" "github.com/openshift/hive/pkg/install" k8slabels "github.com/openshift/hive/pkg/util/labels" @@ -97,7 +98,7 @@ func (r *ReconcileClusterDeployment) startNewProvision( logger.WithError(err).Log(controllerutils.LogLevel(err), "failed to update cluster deployment status") return reconcile.Result{}, err } - incProvisionFailedTerminal(cd) + incProvisionFailedTerminal(cd, logger) } return reconcile.Result{}, nil } @@ -242,7 +243,9 @@ func (r *ReconcileClusterDeployment) startNewProvision( if cd.Status.InstallRestarts == 0 { kickstartDuration := time.Since(cd.CreationTimestamp.Time) logger.WithField("elapsed", kickstartDuration.Seconds()).Info("calculated time to first provision seconds") - metricInstallDelaySeconds.Observe(float64(kickstartDuration.Seconds())) + if metrics.ShouldLogHistogramVec(metrics.MetricInstallDelaySeconds, cd, logger) { + metrics.MetricInstallDelaySeconds.WithLabelValues().Observe(float64(kickstartDuration.Seconds())) + } } return reconcile.Result{}, nil @@ -629,12 +632,18 @@ func (r *ReconcileClusterDeployment) reconcileCompletedProvision(cd *hivev1.Clus } jobDuration := time.Since(startTime.Time) cdLog.WithField("duration", jobDuration.Seconds()).Debug("install job completed") - metricInstallJobDuration.Observe(float64(jobDuration.Seconds())) + if metrics.ShouldLogHistogramVec(metrics.MetricInstallJobDuration, cd, cdLog) { + 
metrics.MetricInstallJobDuration.WithLabelValues().Observe(float64(jobDuration.Seconds())) + } - // Report a metric for the total number of install restarts: - metricCompletedInstallJobRestarts.Observe(cd, nil, float64(cd.Status.InstallRestarts)) + if metrics.ShouldLogHistogramOpts(metrics.MetricCompletedInstallJobRestarts.HistogramOpts, cd, cdLog) { + // Report a metric for the total number of install restarts: + metrics.MetricCompletedInstallJobRestarts.Observe(cd, nil, float64(cd.Status.InstallRestarts)) + } - metricClustersInstalled.Observe(cd, nil, 1) + if metrics.ShouldLogCounterOpts(metrics.MetricClustersInstalled.CounterOpts, cd, cdLog) { + metrics.MetricClustersInstalled.Observe(cd, nil, 1) + } return reconcile.Result{}, nil } diff --git a/pkg/controller/clusterdeployment/metrics.go b/pkg/controller/clusterdeployment/metrics.go deleted file mode 100644 index e8d42dbc916..00000000000 --- a/pkg/controller/clusterdeployment/metrics.go +++ /dev/null @@ -1,127 +0,0 @@ -package clusterdeployment - -import ( - "github.com/prometheus/client_golang/prometheus" - - "sigs.k8s.io/controller-runtime/pkg/metrics" - - hivev1 "github.com/openshift/hive/apis/hive/v1" - "github.com/openshift/hive/apis/hive/v1/metricsconfig" - hivemetrics "github.com/openshift/hive/pkg/controller/metrics" - controllerutils "github.com/openshift/hive/pkg/controller/utils" -) - -var ( - metricInstallJobDuration = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_install_job_duration_seconds", - Help: "Distribution of the runtime of completed install jobs.", - Buckets: []float64{1800, 2400, 3000, 3600, 4500, 5400, 7200}, - }, - ) - metricInstallDelaySeconds = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_install_job_delay_seconds", - Help: "Time between cluster deployment creation and creation of the job to install/provision the cluster.", - Buckets: []float64{60, 120, 180, 240, 300, 600, 1200, 1800, 2700, 3600}, - }, - 
) - metricImageSetDelaySeconds = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_imageset_job_delay_seconds", - Help: "Time between cluster deployment creation and creation of the job which resolves the installer image to use for a ClusterImageSet.", - Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800}, - }, - ) - metricDNSDelaySeconds = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_dns_delay_seconds", - Help: "Time between cluster deployment with spec.manageDNS creation and the DNSZone becoming ready.", - Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800}, - }, - ) - - // Declare the metrics which allow optional labels to be added. - // They are defined later once the hive config has been read. - metricCompletedInstallJobRestarts hivemetrics.HistogramVecWithDynamicLabels - - metricClustersCreated hivemetrics.CounterVecWithDynamicLabels - metricClustersInstalled hivemetrics.CounterVecWithDynamicLabels - metricClustersDeleted hivemetrics.CounterVecWithDynamicLabels - metricProvisionFailedTerminal hivemetrics.CounterVecWithDynamicLabels -) - -func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment) { - poolNSName := "" - if poolRef := cd.Spec.ClusterPoolRef; poolRef != nil { - poolNSName = poolRef.Namespace + "/" + poolRef.PoolName - } - stoppedReason := "unknown" - stoppedCondition := controllerutils.FindCondition(cd.Status.Conditions, hivev1.ProvisionStoppedCondition) - if stoppedCondition != nil { - stoppedReason = stoppedCondition.Reason - } - fixedLabels := map[string]string{ - "clusterpool_namespacedname": poolNSName, - "failure_reason": stoppedReason, - } - metricProvisionFailedTerminal.Observe(cd, fixedLabels, 1) -} - -func registerMetrics(mConfig *metricsconfig.MetricsConfig) { - mapClusterTypeLabelToValue := hivemetrics.GetOptionalClusterTypeLabels(mConfig) - - metricCompletedInstallJobRestarts = *hivemetrics.NewHistogramVecWithDynamicLabels( - &prometheus.HistogramOpts{ 
- Name: "hive_cluster_deployment_completed_install_restart", - Help: "Distribution of the number of restarts for all completed cluster installations.", - Buckets: []float64{0, 2, 10, 20, 50}, - }, - nil, - mapClusterTypeLabelToValue, - ) - metricClustersCreated = *hivemetrics.NewCounterVecWithDynamicLabels( - &prometheus.CounterOpts{ - Name: "hive_cluster_deployments_created_total", - Help: "Counter incremented every time we observe a new cluster.", - }, - nil, - mapClusterTypeLabelToValue, - ) - metricClustersInstalled = *hivemetrics.NewCounterVecWithDynamicLabels( - &prometheus.CounterOpts{ - Name: "hive_cluster_deployments_installed_total", - Help: "Counter incremented every time we observe a successful installation.", - }, - nil, - mapClusterTypeLabelToValue, - ) - metricClustersDeleted = *hivemetrics.NewCounterVecWithDynamicLabels( - &prometheus.CounterOpts{ - Name: "hive_cluster_deployments_deleted_total", - Help: "Counter incremented every time we observe a deleted cluster.", - }, - nil, - mapClusterTypeLabelToValue, - ) - metricProvisionFailedTerminal = *hivemetrics.NewCounterVecWithDynamicLabels( - &prometheus.CounterOpts{ - Name: "hive_cluster_deployments_provision_failed_terminal_total", - Help: "Counter incremented when a cluster provision has failed and won't be retried.", - }, - []string{"clusterpool_namespacedname", "failure_reason"}, - mapClusterTypeLabelToValue, - ) - - metrics.Registry.MustRegister(metricInstallJobDuration) - metrics.Registry.MustRegister(metricInstallDelaySeconds) - metrics.Registry.MustRegister(metricImageSetDelaySeconds) - metrics.Registry.MustRegister(metricDNSDelaySeconds) - - metricProvisionFailedTerminal.Register() - metricCompletedInstallJobRestarts.Register() - metricClustersCreated.Register() - metricClustersInstalled.Register() - metricClustersDeleted.Register() - -} diff --git a/pkg/controller/clusterdeprovision/clusterdeprovision_controller.go b/pkg/controller/clusterdeprovision/clusterdeprovision_controller.go index 
2c3c0663579..45fa4861b22 100644 --- a/pkg/controller/clusterdeprovision/clusterdeprovision_controller.go +++ b/pkg/controller/clusterdeprovision/clusterdeprovision_controller.go @@ -7,7 +7,6 @@ import ( "strconv" "strings" - "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" @@ -23,7 +22,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" @@ -43,14 +41,6 @@ const ( ) var ( - metricUninstallJobDuration = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_uninstall_job_duration_seconds", - Help: "Distribution of the runtime of completed uninstall jobs.", - Buckets: []float64{60, 300, 600, 1200, 1800, 2400, 3000, 3600}, - }, - ) - // actuators is a list of available actuators for this controller // It is populated via the registerActuator function actuators []Actuator @@ -63,10 +53,6 @@ func registerActuator(a Actuator) { actuators = append(actuators, a) } -func init() { - metrics.Registry.MustRegister(metricUninstallJobDuration) -} - // Add creates a new ClusterDeprovision Controller and adds it to the Manager with default RBAC. The Manager will set fields on the Controller // and Start it when the Manager is Started. 
func Add(mgr manager.Manager) error { @@ -373,7 +359,9 @@ func (r *ReconcileClusterDeprovision) Reconcile(ctx context.Context, request rec rLog.WithError(err).Log(controllerutils.LogLevel(err), "error updating request status") return reconcile.Result{}, err } - metricUninstallJobDuration.Observe(float64(jobDuration.Seconds())) + if hivemetrics.ShouldLogHistogramVec(hivemetrics.MetricUninstallJobDuration, cd, rLog) { + hivemetrics.MetricUninstallJobDuration.WithLabelValues().Observe(float64(jobDuration.Seconds())) + } return reconcile.Result{}, nil } diff --git a/pkg/controller/clusterprovision/clusterprovision_controller.go b/pkg/controller/clusterprovision/clusterprovision_controller.go index 5eff98e11dc..21b8fa7b226 100644 --- a/pkg/controller/clusterprovision/clusterprovision_controller.go +++ b/pkg/controller/clusterprovision/clusterprovision_controller.go @@ -626,9 +626,9 @@ func (r *ReconcileClusterProvision) logProvisionSuccessFailureMetric( r.logger.WithError(err).Error("error getting cluster deployment") return } - timeMetric := metricInstallFailureSeconds + timeMetric := hivemetrics.MetricInstallFailureSeconds if stage == hivev1.ClusterProvisionStageComplete { - timeMetric = metricInstallSuccessSeconds + timeMetric = hivemetrics.MetricInstallSuccessSeconds } installVersion := constants.MetricLabelDefaultValue // InstallVersion is set by the imageset job. Can be nil if we never ran that (e.g. minimal install mode). 
@@ -642,5 +642,7 @@ func (r *ReconcileClusterProvision) logProvisionSuccessFailureMetric( "workers": r.getWorkers(*cd), "install_attempt": strconv.Itoa(instance.Spec.Attempt), } - timeMetric.Observe(cd, fixedLabels, time.Since(instance.CreationTimestamp.Time).Seconds()) + if hivemetrics.ShouldLogHistogramOpts(timeMetric.HistogramOpts, cd, r.logger) { + timeMetric.Observe(cd, fixedLabels, time.Since(instance.CreationTimestamp.Time).Seconds()) + } } diff --git a/pkg/controller/clusterprovision/metrics.go b/pkg/controller/clusterprovision/metrics.go index e4eac8e242e..084a880181f 100644 --- a/pkg/controller/clusterprovision/metrics.go +++ b/pkg/controller/clusterprovision/metrics.go @@ -12,9 +12,6 @@ var ( // They are defined later once the hive config has been read. metricClusterProvisionsTotal hivemetrics.CounterVecWithDynamicLabels metricInstallErrors hivemetrics.CounterVecWithDynamicLabels - - metricInstallFailureSeconds hivemetrics.HistogramVecWithDynamicLabels - metricInstallSuccessSeconds hivemetrics.HistogramVecWithDynamicLabels ) func registerMetrics(mConfig *metricsconfig.MetricsConfig) { @@ -37,27 +34,6 @@ func registerMetrics(mConfig *metricsconfig.MetricsConfig) { mapClusterTypeLabelToValue, ) - metricInstallFailureSeconds = *hivemetrics.NewHistogramVecWithDynamicLabels( - &prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_install_failure_total", - Help: "Time taken before a cluster provision failed to install", - Buckets: []float64{30, 120, 300, 600, 1800}, - }, - []string{"platform", "region", "cluster_version", "workers", "install_attempt"}, - mapClusterTypeLabelToValue, - ) - metricInstallSuccessSeconds = *hivemetrics.NewHistogramVecWithDynamicLabels( - &prometheus.HistogramOpts{ - Name: "hive_cluster_deployment_install_success_total", - Help: "Time taken before a cluster provision succeeded to install", - Buckets: []float64{1800, 2400, 3000, 3600}, - }, - []string{"platform", "region", "cluster_version", "workers", "install_attempt"}, - 
mapClusterTypeLabelToValue, - ) - metricInstallErrors.Register() metricClusterProvisionsTotal.Register() - metricInstallFailureSeconds.Register() - metricInstallSuccessSeconds.Register() } diff --git a/pkg/controller/hibernation/hibernation_controller.go b/pkg/controller/hibernation/hibernation_controller.go index 67284c0e580..8479122b3fe 100644 --- a/pkg/controller/hibernation/hibernation_controller.go +++ b/pkg/controller/hibernation/hibernation_controller.go @@ -675,7 +675,7 @@ func logCumulativeMetric(metric *prometheus.HistogramVec, cd *hivev1.ClusterDepl return } time := time.Since(condition.LastTransitionTime.Time).Seconds() - if !hivemetrics.ShouldLogHistogramDurationMetric(metric, time) { + if !hivemetrics.ShouldLogHistogramDurationMetric(metric, time) || !hivemetrics.ShouldLogHistogramVec(metric, cd, logger) { return } poolNS, poolName := "", "" diff --git a/pkg/controller/metrics/cluster_deployment_label_selector.go b/pkg/controller/metrics/cluster_deployment_label_selector.go new file mode 100644 index 00000000000..823b5212d12 --- /dev/null +++ b/pkg/controller/metrics/cluster_deployment_label_selector.go @@ -0,0 +1,106 @@ +package metrics + +import ( + "github.com/sirupsen/logrus" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + + hivev1 "github.com/openshift/hive/apis/hive/v1" +) + +var ( + // empty interface for the ease of setting values for supportedMetrics + empty interface{} +) + +type clusterDeploymentLabelSelectorMetrics struct { + // supportedMetrics lists all the metrics that hive currently logs, which support clusterDeploymentLabelSelector customization + // It is a map with empty interface set as value, to keep it lightweight and with O(1) look up time. 
+ supportedMetrics map[string]interface{} + // metricsWithLabelSelector will be the map of metrics that have clusterDeploymentLabelSelector customization added in metricsConfig + metricsWithLabelSelector map[string]v1.LabelSelector +} + +func newClusterDeploymentLabelSelectorMetrics() *clusterDeploymentLabelSelectorMetrics { + return &clusterDeploymentLabelSelectorMetrics{ + supportedMetrics: map[string]interface{}{ + // counterOpts + "hive_cluster_deployments_created_total": empty, + "hive_cluster_deployments_installed_total": empty, + "hive_cluster_deployments_deleted_total": empty, + "hive_cluster_deployments_provision_failed_terminal_total": empty, + // gaugeVecs + "hive_cluster_deployments": empty, + "hive_cluster_deployments_installed": empty, + "hive_cluster_deployments_uninstalled": empty, + "hive_cluster_deployments_deprovisioning": empty, + "hive_cluster_deployments_conditions": empty, + "hive_cluster_deployment_syncset_paused": empty, + // histogramVecs + "hive_cluster_deployments_hibernation_transition_seconds": empty, + "hive_cluster_deployments_running_transition_seconds": empty, + "hive_cluster_deployments_stopping_seconds": empty, + "hive_cluster_deployments_resuming_seconds": empty, + "hive_cluster_deployments_waiting_for_cluster_operators_seconds": empty, + "hive_cluster_deployment_install_job_duration_seconds": empty, + "hive_cluster_deployment_install_job_delay_seconds": empty, + "hive_cluster_deployment_imageset_job_delay_seconds": empty, + "hive_cluster_deployment_dns_delay_seconds": empty, + "hive_cluster_deployment_uninstall_job_duration_seconds": empty, + // histogramOpts + "hive_cluster_deployment_completed_install_restart": empty, + "hive_cluster_deployment_install_failure_total": empty, + "hive_cluster_deployment_install_success_total": empty, + // custom metrics + "hive_cluster_deployment_provision_underway_seconds": empty, + "hive_cluster_deployment_provision_underway_install_restarts": empty, + 
"hive_cluster_deployment_deprovision_underway_seconds": empty, + "hive_clustersync_failing_seconds": empty, + }, + metricsWithLabelSelector: make(map[string]v1.LabelSelector), + } +} + +// isMetricSupported should be used to validate if the metric name is valid and supported clusterDeploymentLabelSelector customization +func (ls *clusterDeploymentLabelSelectorMetrics) isMetricSupported(name string) bool { + if _, ok := ls.supportedMetrics[name]; ok { + return true + } + return false +} + +// hasClusterDeploymentLabelSelector can be used to check if the metric has a related clusterDeploymentLabelSelector configured in HiveConfig +func (ls *clusterDeploymentLabelSelectorMetrics) hasClusterDeploymentLabelSelector(metricName string) bool { + if _, ok := ls.metricsWithLabelSelector[metricName]; ok { + return true + } + return false +} + +// matchesLabelSelector can be used to determine the cluster deployment matches the label selector configured +func (ls *clusterDeploymentLabelSelectorMetrics) matchesLabelSelector(name string, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool { + if _, ok := ls.metricsWithLabelSelector[name]; !ok { + return false + } + value := ls.metricsWithLabelSelector[name] + selector, err := v1.LabelSelectorAsSelector(&value) + if err != nil { + log.WithError(err).WithField("metric", name).Errorf("cannot parse clusterDeploymentLabelSelector") + return false + } + if selector.Matches(labels.Set(cd.Labels)) { + return true + } + return false +} + +// shouldLogMetric encapsulates the logic of ClusterDeploymentLabelSelector for the provided metric and the related clusterDeployment and decides if the +// metric must be logged +func (ls *clusterDeploymentLabelSelectorMetrics) shouldLogMetric(name string, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool { + // If there is no related clusterDeploymentLabelSelector, log the metric since there's no restriction on reporting it + if !ls.hasClusterDeploymentLabelSelector(name) { + return 
true + } + return ls.matchesLabelSelector(name, cd, log) +} diff --git a/pkg/controller/metrics/custom_collectors.go b/pkg/controller/metrics/custom_collectors.go index b228614f6cf..73fb1549504 100644 --- a/pkg/controller/metrics/custom_collectors.go +++ b/pkg/controller/metrics/custom_collectors.go @@ -54,7 +54,7 @@ func (cc provisioningUnderwayCollector) Collect(ch chan<- prometheus.Metric) { clusterDeployments := &hivev1.ClusterDeploymentList{} err := cc.client.List(context.Background(), clusterDeployments) if err != nil { - log.WithError(err).Error("error listing cluster deployments") + ccLog.WithError(err).Error("error listing cluster deployments") return } for _, cd := range clusterDeployments.Items { @@ -64,6 +64,9 @@ func (cc provisioningUnderwayCollector) Collect(ch chan<- prometheus.Metric) { if cd.Spec.Installed { continue } + if !ShouldLogCustomMetric(metricClusterDeploymentProvisionUnderwaySecondsDesc, &cd, ccLog) { + continue + } platform := cd.Labels[hivev1.HiveClusterPlatformLabel] imageSet := "none" @@ -143,7 +146,7 @@ func (cc provisioningUnderwayInstallRestartsCollector) Collect(ch chan<- prometh clusterDeployments := &hivev1.ClusterDeploymentList{} err := cc.client.List(context.Background(), clusterDeployments) if err != nil { - log.WithError(err).Error("error listing cluster deployments") + ccLog.WithError(err).Error("error listing cluster deployments") return } for _, cd := range clusterDeployments.Items { @@ -153,6 +156,9 @@ func (cc provisioningUnderwayInstallRestartsCollector) Collect(ch chan<- prometh if cd.Spec.Installed { continue } + if !ShouldLogCustomMetric(provisioningUnderwayInstallRestartsCollectorDesc, &cd, ccLog) { + continue + } platform := cd.Labels[hivev1.HiveClusterPlatformLabel] imageSet := "none" @@ -260,13 +266,16 @@ func (cc deprovisioningUnderwayCollector) Collect(ch chan<- prometheus.Metric) { clusterDeployments := &hivev1.ClusterDeploymentList{} err := cc.client.List(context.Background(), clusterDeployments) if err != 
nil { - log.WithError(err).Error("error listing cluster deployments") + ccLog.WithError(err).Error("error listing cluster deployments") return } for _, cd := range clusterDeployments.Items { if cd.DeletionTimestamp == nil { continue } + if !ShouldLogCustomMetric(metricClusterDeploymentDeprovisionUnderwaySecondsDesc, &cd, ccLog) { + continue + } elapsedDuration := time.Since(cd.DeletionTimestamp.Time) @@ -279,9 +288,7 @@ func (cc deprovisioningUnderwayCollector) Collect(ch chan<- prometheus.Metric) { cd.Namespace, GetLabelValue(&cd, hivev1.HiveClusterTypeLabel), ) - } - } func (cc deprovisioningUnderwayCollector) Describe(ch chan<- *prometheus.Desc) { @@ -331,14 +338,14 @@ func (cc clusterSyncFailingCollector) Collect(ch chan<- prometheus.Metric) { clusterSyncList := &hiveintv1alpha1.ClusterSyncList{} err := cc.client.List(context.Background(), clusterSyncList) if err != nil { - log.WithError(err).Error("error listing all ClusterSyncs") + ccLog.WithError(err).Error("error listing all ClusterSyncs") return } clusterDeployments := &hivev1.ClusterDeploymentList{} err = cc.client.List(context.Background(), clusterDeployments) if err != nil { - log.WithError(err).Error("error listing cluster deployments") + ccLog.WithError(err).Error("error listing cluster deployments") return } @@ -353,23 +360,25 @@ func (cc clusterSyncFailingCollector) Collect(ch chan<- prometheus.Metric) { break } } - fixedLabels := make(map[string]string, len(cc.dynamicLabels.fixedLabels)) - fixedLabels["namespaced_name"] = cs.Namespace + "/" + cs.Name - if !reflect.ValueOf(cdRef).IsZero() { - if unreachableCondition := controllerutils.FindCondition(cdRef.Status.Conditions, hivev1.UnreachableCondition); unreachableCondition != nil { - fixedLabels["unreachable"] = string(unreachableCondition.Status) + if ShouldLogCustomMetric(metricClusterSyncFailingSeconds, &cdRef, ccLog) { + fixedLabels := make(map[string]string, len(cc.dynamicLabels.fixedLabels)) + fixedLabels["namespaced_name"] = cs.Namespace + 
"/" + cs.Name + if !reflect.ValueOf(cdRef).IsZero() { + if unreachableCondition := controllerutils.FindCondition(cdRef.Status.Conditions, hivev1.UnreachableCondition); unreachableCondition != nil { + fixedLabels["unreachable"] = string(unreachableCondition.Status) + } + } + labelValues := cc.dynamicLabels.buildLabelSlice(fixedLabels, &cdRef) + seconds := time.Since(cond.LastTransitionTime.Time).Seconds() + // check if duration crosses the threshold + if cc.minDuration.Seconds() <= seconds { + ch <- prometheus.MustNewConstMetric( + cc.metricClusterSyncFailingSeconds, + prometheus.GaugeValue, + seconds, + labelValues..., + ) } - } - labelValues := cc.dynamicLabels.buildLabelSlice(fixedLabels, &cdRef) - seconds := time.Since(cond.LastTransitionTime.Time).Seconds() - // check if duration crosses the threshold - if cc.minDuration.Seconds() <= seconds { - ch <- prometheus.MustNewConstMetric( - cc.metricClusterSyncFailingSeconds, - prometheus.GaugeValue, - seconds, - labelValues..., - ) } } } @@ -379,6 +388,8 @@ func (cc clusterSyncFailingCollector) Describe(ch chan<- *prometheus.Desc) { prometheus.DescribeByCollect(cc, ch) } +var metricClusterSyncFailingSeconds *prometheus.Desc + func newClusterSyncFailingCollector(client client.Client, minimum time.Duration, optionalLabels map[string]string) prometheus.Collector { metricName := "hive_clustersync_failing_seconds" baseLabels := dynamicLabels{ @@ -394,15 +405,16 @@ func newClusterSyncFailingCollector(client client.Client, minimum time.Duration, dynamicLabels: baseLabels, labelList: baseLabels.getLabelList(), } + metricClusterSyncFailingSeconds = prometheus.NewDesc( + metricName, + "Length of time a clustersync has been failing", + labels.labelList, + nil, + ) return clusterSyncFailingCollector{ - client: client, - metricClusterSyncFailingSeconds: prometheus.NewDesc( - metricName, - "Length of time a clustersync has been failing", - labels.labelList, - nil, - ), - minDuration: minimum, - dynamicLabels: labels, + client: 
client, + metricClusterSyncFailingSeconds: metricClusterSyncFailingSeconds, + minDuration: minimum, + dynamicLabels: labels, } } diff --git a/pkg/controller/metrics/metrics.go b/pkg/controller/metrics/metrics.go index 64773493888..996e374980d 100644 --- a/pkg/controller/metrics/metrics.go +++ b/pkg/controller/metrics/metrics.go @@ -155,6 +155,115 @@ var ( // mapMetricToDurationGauges is a map of optional durationMetrics of type Gauge to their specific duration, if // mentioned mapMetricToDurationGauges map[*prometheus.GaugeVec]time.Duration + + // Metrics reported by ClusterDeployment controller + + MetricInstallJobDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_install_job_duration_seconds", + Help: "Distribution of the runtime of completed install jobs.", + Buckets: []float64{1800, 2400, 3000, 3600, 4500, 5400, 7200}, + }, + nil, + ) + MetricInstallDelaySeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_install_job_delay_seconds", + Help: "Time between cluster deployment creation and creation of the job to install/provision the cluster.", + Buckets: []float64{60, 120, 180, 240, 300, 600, 1200, 1800, 2700, 3600}, + }, + nil, + ) + MetricImageSetDelaySeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_imageset_job_delay_seconds", + Help: "Time between cluster deployment creation and creation of the job which resolves the installer image to use for a ClusterImageSet.", + Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800}, + }, + nil, + ) + MetricDNSDelaySeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_dns_delay_seconds", + Help: "Time between cluster deployment with spec.manageDNS creation and the DNSZone becoming ready.", + Buckets: []float64{10, 30, 60, 300, 600, 1200, 1800}, + }, + nil, + ) + // Metrics with additional label support. 
The dynamic labels will be set when we register these metrics after reading the metricsConfig. + MetricCompletedInstallJobRestarts = *NewHistogramVecWithDynamicLabels( + &prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_completed_install_restart", + Help: "Distribution of the number of restarts for all completed cluster installations.", + Buckets: []float64{0, 2, 10, 20, 50}, + }, + nil, + map[string]string{}, + ) + MetricClustersCreated = *NewCounterVecWithDynamicLabels( + &prometheus.CounterOpts{ + Name: "hive_cluster_deployments_created_total", + Help: "Counter incremented every time we observe a new cluster.", + }, + nil, + map[string]string{}, + ) + MetricClustersInstalled = *NewCounterVecWithDynamicLabels( + &prometheus.CounterOpts{ + Name: "hive_cluster_deployments_installed_total", + Help: "Counter incremented every time we observe a successful installation.", + }, + nil, + map[string]string{}, + ) + MetricClustersDeleted = *NewCounterVecWithDynamicLabels( + &prometheus.CounterOpts{ + Name: "hive_cluster_deployments_deleted_total", + Help: "Counter incremented every time we observe a deleted cluster.", + }, + nil, + map[string]string{}, + ) + MetricProvisionFailedTerminal = *NewCounterVecWithDynamicLabels( + &prometheus.CounterOpts{ + Name: "hive_cluster_deployments_provision_failed_terminal_total", + Help: "Counter incremented when a cluster provision has failed and won't be retried.", + }, + []string{"clusterpool_namespacedname", "failure_reason"}, + map[string]string{}, + ) + + // Metrics reported by ClusterDeprovision controller + + MetricUninstallJobDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_uninstall_job_duration_seconds", + Help: "Distribution of the runtime of completed uninstall jobs.", + Buckets: []float64{60, 300, 600, 1200, 1800, 2400, 3000, 3600}, + }, + nil, + ) + + // Some metrics reported by ClusterProvision controller, they support clusterDeploymentLabelSelector + + 
MetricInstallFailureSeconds = *NewHistogramVecWithDynamicLabels( + &prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_install_failure_total", + Help: "Time taken before a cluster provision failed to install", + Buckets: []float64{30, 120, 300, 600, 1800}, + }, + []string{"platform", "region", "cluster_version", "workers", "install_attempt"}, + map[string]string{}, + ) + MetricInstallSuccessSeconds = *NewHistogramVecWithDynamicLabels( + &prometheus.HistogramOpts{ + Name: "hive_cluster_deployment_install_success_total", + Help: "Time taken before a cluster provision succeeded to install", + Buckets: []float64{1800, 2400, 3000, 3600}, + }, + []string{"platform", "region", "cluster_version", "workers", "install_attempt"}, + map[string]string{}, + ) ) // ReconcileOutcome is used in controller "reconcile complete" log entries, and the metricControllerReconcileTime @@ -186,6 +295,11 @@ func init() { metrics.Registry.MustRegister(metricSyncSetsUnappliedTotal) metrics.Registry.MustRegister(metricControllerReconcileTime) metrics.Registry.MustRegister(metricClusterDeploymentSyncsetPaused) + metrics.Registry.MustRegister(MetricInstallJobDuration) + metrics.Registry.MustRegister(MetricInstallDelaySeconds) + metrics.Registry.MustRegister(MetricImageSetDelaySeconds) + metrics.Registry.MustRegister(MetricDNSDelaySeconds) + metrics.Registry.MustRegister(MetricUninstallJobDuration) } // Add creates a new metrics Calculator and adds it to the Manager. 
@@ -251,9 +365,9 @@ func (mc *Calculator) Start(ctx context.Context) error { } for _, cd := range clusterDeployments.Items { clusterType := GetLabelValue(&cd, hivev1.HiveClusterTypeLabel) - accumulator.processCluster(&cd) + accumulator.processCluster(&cd, mcLog) - if paused, err := strconv.ParseBool(cd.Annotations[constants.SyncsetPauseAnnotation]); err == nil && paused { + if paused, err := strconv.ParseBool(cd.Annotations[constants.SyncsetPauseAnnotation]); err == nil && paused && ShouldLogGaugeVec(metricClusterDeploymentSyncsetPaused, &cd, mcLog) { metricClusterDeploymentSyncsetPaused.WithLabelValues( cd.Name, cd.Namespace, @@ -308,7 +422,7 @@ func (mc *Calculator) Start(ctx context.Context) error { return } for _, cd := range clusterDeployments.Items { - accumulator.processCluster(&cd) + accumulator.processCluster(&cd, mcLog) } accumulator.setMetrics(metricClusterDeploymentsTotal, @@ -389,6 +503,7 @@ func (mc *Calculator) Start(ctx context.Context) error { func (mc *Calculator) registerOptionalMetrics(mConfig *metricsconfig.MetricsConfig) { mapMetricToDurationHistograms = make(map[*prometheus.HistogramVec]time.Duration) mapMetricToDurationGauges = make(map[*prometheus.GaugeVec]time.Duration) + optionalLabels := GetOptionalClusterTypeLabels(mConfig) for _, metric := range mConfig.MetricsWithDuration { switch metric.Name { // Histograms @@ -409,9 +524,25 @@ func (mc *Calculator) registerOptionalMetrics(mConfig *metricsconfig.MetricsConf mapMetricToDurationHistograms[MetricClusterReadyTransitionSeconds] = metric.Duration.Duration // Gauges case metricsconfig.CurrentClusterSyncFailing: - metrics.Registry.MustRegister(newClusterSyncFailingCollector(mc.Client, metric.Duration.Duration, GetOptionalClusterTypeLabels(mConfig))) + metrics.Registry.MustRegister(newClusterSyncFailingCollector(mc.Client, metric.Duration.Duration, optionalLabels)) } } + // Set dynamic labels for metrics with additional label support and register them + 
MetricProvisionFailedTerminal.optionalLabels = optionalLabels + MetricCompletedInstallJobRestarts.optionalLabels = optionalLabels + MetricClustersCreated.optionalLabels = optionalLabels + MetricClustersInstalled.optionalLabels = optionalLabels + MetricClustersDeleted.optionalLabels = optionalLabels + MetricInstallFailureSeconds.optionalLabels = optionalLabels + MetricInstallSuccessSeconds.optionalLabels = optionalLabels + + MetricProvisionFailedTerminal.Register() + MetricCompletedInstallJobRestarts.Register() + MetricClustersCreated.Register() + MetricClustersInstalled.Register() + MetricClustersDeleted.Register() + MetricInstallFailureSeconds.Register() + MetricInstallSuccessSeconds.Register() } // ShouldLogHistogramDurationMetric decides whether the corresponding duration metric of type histogram should be logged. @@ -599,7 +730,7 @@ func (ca *clusterAccumulator) ensureClusterTypeBuckets(clusterType string, power } } -func (ca *clusterAccumulator) processCluster(cd *hivev1.ClusterDeployment) { +func (ca *clusterAccumulator) processCluster(cd *hivev1.ClusterDeployment, log log.FieldLogger) { if ca.ageFilter != infinity && time.Since(cd.CreationTimestamp.Time) > ca.ageFilterDur { return } @@ -609,9 +740,11 @@ func (ca *clusterAccumulator) processCluster(cd *hivev1.ClusterDeployment) { ca.ensureClusterTypeBuckets(clusterType, powerState) ca.clusterTypesSet[clusterType] = true - ca.total[powerState][clusterType]++ + if ShouldLogGaugeVec(metricClusterDeploymentsTotal, cd, log) { + ca.total[powerState][clusterType]++ + } - if cd.DeletionTimestamp != nil { + if cd.DeletionTimestamp != nil && ShouldLogGaugeVec(metricClusterDeploymentsDeprovisioningTotal, cd, log) { // Sort deleted clusters into buckets based on how long since // they were deleted. The larger the bucket the more serious the problem. 
deletedDur := time.Since(cd.DeletionTimestamp.Time) @@ -626,8 +759,10 @@ func (ca *clusterAccumulator) processCluster(cd *hivev1.ClusterDeployment) { } if cd.Spec.Installed { - ca.installed[clusterType]++ - } else { + if ShouldLogGaugeVec(metricClusterDeploymentsInstalledTotal, cd, log) { + ca.installed[clusterType]++ + } + } else if ShouldLogGaugeVec(metricClusterDeploymentsUninstalledTotal, cd, log) { // Sort uninstall clusters into buckets based on how long since // they were created. The larger the bucket the more serious the problem. uninstalledDur := time.Since(cd.CreationTimestamp.Time) @@ -641,10 +776,12 @@ func (ca *clusterAccumulator) processCluster(cd *hivev1.ClusterDeployment) { } } - // Process conditions regardless if installed or not: - for _, cond := range cd.Status.Conditions { - if !controllerutils.IsConditionInDesiredState(cond) { - ca.addConditionToMap(cond.Type, clusterType) + if ShouldLogGaugeVec(metricClusterDeploymentsWithConditionTotal, cd, log) { + // Process conditions regardless if installed or not: + for _, cond := range cd.Status.Conditions { + if !controllerutils.IsConditionInDesiredState(cond) { + ca.addConditionToMap(cond.Type, clusterType) + } } } } diff --git a/pkg/controller/metrics/metrics_test.go b/pkg/controller/metrics/metrics_test.go index 7f9feaf9c68..fddc96ea061 100644 --- a/pkg/controller/metrics/metrics_test.go +++ b/pkg/controller/metrics/metrics_test.go @@ -1,6 +1,7 @@ package metrics import ( + log "github.com/sirupsen/logrus" "testing" "time" @@ -78,7 +79,7 @@ func TestClusterAccumulator(t *testing.T) { accumulator, _ := newClusterAccumulator(infinity, []string{"0h", "1h", "2h", "8h", "24h", "72h"}) for _, cd := range clusters { - accumulator.processCluster(&cd) + accumulator.processCluster(&cd, log.New()) } assert.Equal(t, 12, accumulator.total["unspecified"]["managed"]) @@ -116,7 +117,7 @@ func TestClusterAccumulator(t *testing.T) { // Also test with a cluster age filter: accumulator, _ = 
newClusterAccumulator("8h", []string{"0h", "1h", "2h", "8h", "24h", "72h"})
 	for _, cd := range clusters {
-		accumulator.processCluster(&cd)
+		accumulator.processCluster(&cd, log.New())
 	}
 	assert.Equal(t, 7, accumulator.total["unspecified"]["managed"])
 	assert.Equal(t, 2, accumulator.total["Hibernating"]["managed"])
diff --git a/pkg/controller/metrics/metrics_to_report.go b/pkg/controller/metrics/metrics_to_report.go
new file mode 100644
index 00000000000..e01ac4ad125
--- /dev/null
+++ b/pkg/controller/metrics/metrics_to_report.go
@@ -0,0 +1,112 @@
+package metrics
+
+import (
+	"fmt"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/sirupsen/logrus"
+
+	hivev1 "github.com/openshift/hive/apis/hive/v1"
+	"github.com/openshift/hive/apis/hive/v1/metricsconfig"
+)
+
+var (
+	// Maintain a list of all the metrics with their names for validation and observing purposes
+	// We have to maintain the name of the metrics separately as a map because of the way metrics are currently declared, their names are not available in the code
+	counterVecs = map[*prometheus.CounterOpts]string{
+		MetricClustersCreated.CounterOpts:         "hive_cluster_deployments_created_total",
+		MetricClustersInstalled.CounterOpts:       "hive_cluster_deployments_installed_total",
+		MetricClustersDeleted.CounterOpts:         "hive_cluster_deployments_deleted_total",
+		MetricProvisionFailedTerminal.CounterOpts: "hive_cluster_deployments_provision_failed_terminal_total",
+	}
+	gaugeVecs = map[*prometheus.GaugeVec]string{
+		metricClusterDeploymentsTotal:               "hive_cluster_deployments",
+		metricClusterDeploymentsInstalledTotal:      "hive_cluster_deployments_installed",
+		metricClusterDeploymentsUninstalledTotal:    "hive_cluster_deployments_uninstalled",
+		metricClusterDeploymentsDeprovisioningTotal: "hive_cluster_deployments_deprovisioning",
+		metricClusterDeploymentsWithConditionTotal:  "hive_cluster_deployments_conditions",
+		metricClusterDeploymentSyncsetPaused:        "hive_cluster_deployment_syncset_paused",
+	}
+	histogramVecs =
map[*prometheus.HistogramVec]string{ + MetricClusterHibernationTransitionSeconds: "hive_cluster_deployments_hibernation_transition_seconds", + MetricClusterReadyTransitionSeconds: "hive_cluster_deployments_running_transition_seconds", + MetricStoppingClustersSeconds: "hive_cluster_deployments_stopping_seconds", + MetricResumingClustersSeconds: "hive_cluster_deployments_resuming_seconds", + MetricWaitingForCOClustersSeconds: "hive_cluster_deployments_waiting_for_cluster_operators_seconds", + MetricInstallJobDuration: "hive_cluster_deployment_install_job_duration_seconds", + MetricInstallDelaySeconds: "hive_cluster_deployment_install_job_delay_seconds", + MetricImageSetDelaySeconds: "hive_cluster_deployment_imageset_job_delay_seconds", + MetricDNSDelaySeconds: "hive_cluster_deployment_dns_delay_seconds", + MetricUninstallJobDuration: "hive_cluster_deployment_uninstall_job_duration_seconds", + } + histogramOpts = map[*prometheus.HistogramOpts]string{ + MetricCompletedInstallJobRestarts.HistogramOpts: "hive_cluster_deployment_completed_install_restart", + MetricInstallFailureSeconds.HistogramOpts: "hive_cluster_deployment_install_failure_total", + MetricInstallSuccessSeconds.HistogramOpts: "hive_cluster_deployment_install_success_total", + } + customMetrics = map[*prometheus.Desc]string{ + metricClusterDeploymentProvisionUnderwaySecondsDesc: "hive_cluster_deployment_provision_underway_seconds", + provisioningUnderwayInstallRestartsCollectorDesc: "hive_cluster_deployment_provision_underway_install_restarts", + metricClusterDeploymentDeprovisionUnderwaySecondsDesc: "hive_cluster_deployment_deprovision_underway_seconds", + metricClusterSyncFailingSeconds: "hive_clustersync_failing_seconds", + } + + ls clusterDeploymentLabelSelectorMetrics +) + +// GetClusterDeploymentLabelSelectors reads the MetricsToReport from the metricsConfig section of HiveConfig and updates metricsWithLabelSelector map +// Todo: Adapt this function to read all customizations as a part of 
https://issues.redhat.com/browse/HIVE-2618
+func GetClusterDeploymentLabelSelectors(log logrus.FieldLogger, mConfig *metricsconfig.MetricsConfig) error {
+	ls = *newClusterDeploymentLabelSelectorMetrics()
+	if mConfig == nil {
+		return nil
+	}
+	for _, entries := range mConfig.MetricsToReport {
+		for _, name := range entries.MetricNames {
+			if _, ok := ls.metricsWithLabelSelector[name]; ok {
+				return fmt.Errorf("duplicate entries in MetricsConfig.MetricsToReport for %s", name)
+			}
+			// metric name must be valid and must support clusterDeploymentLabelSelector customization
+			if !ls.isMetricSupported(name) {
+				return fmt.Errorf("metric %s either not valid or does not support the clusterDeploymentLabelSelector customization", name)
+			}
+			ls.metricsWithLabelSelector[name] = entries.ClusterDeploymentLabelSelector
+		}
+	}
+	return nil
+}
+
+func ShouldLogCounterOpts(c *prometheus.CounterOpts, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool {
+	if name, ok := counterVecs[c]; ok {
+		return ls.shouldLogMetric(name, cd, log)
+	}
+	return true
+}
+
+func ShouldLogGaugeVec(gv *prometheus.GaugeVec, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool {
+	if name, ok := gaugeVecs[gv]; ok {
+		return ls.shouldLogMetric(name, cd, log)
+	}
+	return true
+}
+
+func ShouldLogHistogramVec(hv *prometheus.HistogramVec, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool {
+	if name, ok := histogramVecs[hv]; ok {
+		return ls.shouldLogMetric(name, cd, log)
+	}
+	return true
+}
+
+func ShouldLogHistogramOpts(h *prometheus.HistogramOpts, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool {
+	if name, ok := histogramOpts[h]; ok {
+		return ls.shouldLogMetric(name, cd, log)
+	}
+	return true
+}
+
+func ShouldLogCustomMetric(d *prometheus.Desc, cd *hivev1.ClusterDeployment, log logrus.FieldLogger) bool {
+	if name, ok := customMetrics[d]; ok {
+		return ls.shouldLogMetric(name, cd, log)
+	}
+	return true
+}
diff --git a/pkg/operator/hive/hive_controller.go b/pkg/operator/hive/hive_controller.go index
5a32ca9661c..18ba689154e 100644 --- a/pkg/operator/hive/hive_controller.go +++ b/pkg/operator/hive/hive_controller.go @@ -40,6 +40,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" "github.com/openshift/hive/pkg/constants" + controllerMetrics "github.com/openshift/hive/pkg/controller/metrics" "github.com/openshift/hive/pkg/operator/metrics" "github.com/openshift/hive/pkg/operator/util" ) @@ -513,6 +514,15 @@ func (r *ReconcileHiveConfig) Reconcile(ctx context.Context, request reconcile.R return reconcile.Result{}, err } + // Read the metricsConfig section and fetch clusterDeploymentLabelSelectors, error out if it fails validation + err = controllerMetrics.GetClusterDeploymentLabelSelectors(hLog, origHiveConfig.Spec.MetricsConfig) + if err != nil { + hLog.WithError(err).Error("error in metricsConfig") + instance.Status.Conditions = util.SetHiveConfigCondition(instance.Status.Conditions, hivev1.HiveReadyCondition, corev1.ConditionFalse, "ErrorInMetricsConfig", err.Error()) + r.updateHiveConfigStatus(origHiveConfig, instance, hLog, false) + return reconcile.Result{}, err + } + mcConfigHash, err := r.deployConfigMap(hLog, h, instance, metricsConfigConfigMapInfo, namespacesToClean) if err != nil { hLog.WithError(err).Error("error deploying metrics config configmap") diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go index 7a85e3e00d5..8477b03eb14 100644 --- a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go +++ b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go @@ -17,4 +17,5 @@ type MetricsConfig struct { // pkg/controller/metrics/metrics_with_dynamic_labels.go // +optional AdditionalClusterDeploymentLabels *map[string]string `json:"additionalClusterDeploymentLabels,omitempty"` + MetricsToReport []MetricsToReport `json:"metricsToReport,omitempty"` } diff --git 
a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_to_report.go b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_to_report.go new file mode 100644 index 00000000000..5aa2542766d --- /dev/null +++ b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_to_report.go @@ -0,0 +1,15 @@ +package metricsconfig + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// MetricsToReport represents metrics that have additional customizations +type MetricsToReport struct { + // MetricNames is a list of metrics for which the following customizations must be added, if they support the customization + // The name of the metric here must be valid, and it can only be present once in metricsToReport. + MetricNames []string `json:"metricNames"` + // ClusterDeploymentLabelSelector can be used to match cluster deployment label present, it can be used to filter the metrics reported. + // It can only be used with metrics that have their clusterdeployment at hand when they are being reported. 
+ ClusterDeploymentLabelSelector metav1.LabelSelector `json:"clusterDeploymentLabelSelector"` +} diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go index 2f657023dcb..f4ce4f77e23 100644 --- a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go +++ b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/zz_generated.deepcopy.go @@ -30,6 +30,13 @@ func (in *MetricsConfig) DeepCopyInto(out *MetricsConfig) { } } } + if in.MetricsToReport != nil { + in, out := &in.MetricsToReport, &out.MetricsToReport + *out = make([]MetricsToReport, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } return } @@ -43,6 +50,28 @@ func (in *MetricsConfig) DeepCopy() *MetricsConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricsToReport) DeepCopyInto(out *MetricsToReport) { + *out = *in + if in.MetricNames != nil { + in, out := &in.MetricNames, &out.MetricNames + *out = make([]string, len(*in)) + copy(*out, *in) + } + in.ClusterDeploymentLabelSelector.DeepCopyInto(&out.ClusterDeploymentLabelSelector) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricsToReport. +func (in *MetricsToReport) DeepCopy() *MetricsToReport { + if in == nil { + return nil + } + out := new(MetricsToReport) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MetricsWithDuration) DeepCopyInto(out *MetricsWithDuration) { *out = *in