diff --git a/CHANGELOG.md b/CHANGELOG.md index 6625492106..4d25ed2b79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 4.13 +- [#1785](https://github.com/openshift/cluster-monitoring-operator/pull/1785) Adds support for CollectionProfiles TechPreview - [#1830](https://github.com/openshift/cluster-monitoring-operator/pull/1830) Add alert KubePodNotScheduled - [#1843](https://github.com/openshift/cluster-monitoring-operator/pull/1843) Node Exporter ignores network interface under name "enP.*". - [#1860](https://github.com/openshift/cluster-monitoring-operator/pull/1860) Adds runbook for PrometheusRuleFailures diff --git a/Documentation/api.md b/Documentation/api.md index c4ac82ac76..03a87cd8f0 100644 --- a/Documentation/api.md +++ b/Documentation/api.md @@ -295,6 +295,7 @@ The `PrometheusK8sConfig` resource defines settings for the Prometheus component | retentionSize | string | Defines the maximum amount of disk space used by data blocks plus the write-ahead log (WAL). Supported values are `B`, `KB`, `KiB`, `MB`, `MiB`, `GB`, `GiB`, `TB`, `TiB`, `PB`, `PiB`, `EB`, and `EiB`. By default, no limit is defined. | | tolerations | [][v1.Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.26/#toleration-v1-core) | Defines tolerations for the pods. | | topologySpreadConstraints | []v1.TopologySpreadConstraint | Defines the pod's topology spread constraints. | +| collectionProfile | CollectionProfile | Defines the metrics collection profile that Prometheus uses to collect metrics from the platform components. Supported values are `full` or `minimal`. In the `full` profile (default), Prometheus collects all metrics that are exposed by the platform components. In the `minimal` profile, Prometheus only collects metrics necessary for the default platform alerts, recording rules, telemetry and console dashboards. | | volumeClaimTemplate | *[monv1.EmbeddedPersistentVolumeClaim](https://github.com/prometheus-operator/prometheus-operator/blob/v0.62.0/Documentation/api.md#embeddedpersistentvolumeclaim) | Defines persistent storage for Prometheus. Use this setting to configure the persistent volume claim, including storage class, volume size and name. | [Back to TOC](#table-of-contents) diff --git a/Documentation/openshiftdocs/modules/prometheusk8sconfig.adoc b/Documentation/openshiftdocs/modules/prometheusk8sconfig.adoc index cc6b009db6..714f224062 100644 --- a/Documentation/openshiftdocs/modules/prometheusk8sconfig.adoc +++ b/Documentation/openshiftdocs/modules/prometheusk8sconfig.adoc @@ -42,6 +42,8 @@ Appears in: link:clustermonitoringconfiguration.adoc[ClusterMonitoringConfigurat |topologySpreadConstraints|[]v1.TopologySpreadConstraint|Defines the pod's topology spread constraints. +|collectionProfile|CollectionProfile|Defines the metrics collection profile that Prometheus uses to collect metrics from the platform components. Supported values are `full` or `minimal`. In the `full` profile (default), Prometheus collects all metrics that are exposed by the platform components. In the `minimal` profile, Prometheus only collects metrics necessary for the default platform alerts, recording rules, telemetry and console dashboards. + |volumeClaimTemplate|*monv1.EmbeddedPersistentVolumeClaim|Defines persistent storage for Prometheus. Use this setting to configure the persistent volume claim, including storage class, volume size and name. 
|=== diff --git a/assets/control-plane/minimal-service-monitor-etcd.yaml b/assets/control-plane/minimal-service-monitor-etcd.yaml new file mode 100644 index 0000000000..6b81480d42 --- /dev/null +++ b/assets/control-plane/minimal-service-monitor-etcd.yaml @@ -0,0 +1,30 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: etcd + k8s-app: etcd + monitoring.openshift.io/collection-profile: minimal + name: etcd-minimal + namespace: openshift-monitoring +spec: + endpoints: + - interval: 30s + metricRelabelings: + - action: keep + regex: (etcd_disk_backend_commit_duration_seconds_bucket|etcd_disk_wal_fsync_duration_seconds_bucket|etcd_mvcc_db_total_size_in_bytes|etcd_mvcc_db_total_size_in_use_in_bytes|etcd_network_peer_round_trip_time_seconds_bucket|etcd_network_peer_sent_failures_total|etcd_server_has_leader|etcd_server_is_leader|etcd_server_proposals_failed_total|etcd_server_quota_backend_bytes|grpc_server_handled_total|grpc_server_handling_seconds_bucket|grpc_server_started_total|process_start_time_seconds) + sourceLabels: + - __name__ + port: etcd-metrics + scheme: https + tlsConfig: + caFile: /etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt + certFile: /etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt + keyFile: /etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key + jobLabel: k8s-app + namespaceSelector: + matchNames: + - openshift-etcd + selector: + matchLabels: + k8s-app: etcd diff --git a/assets/control-plane/minimal-service-monitor-kubelet.yaml b/assets/control-plane/minimal-service-monitor-kubelet.yaml new file mode 100644 index 0000000000..47cff70d0c --- /dev/null +++ b/assets/control-plane/minimal-service-monitor-kubelet.yaml @@ -0,0 +1,106 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: openshift-monitoring + k8s-app: kubelet + monitoring.openshift.io/collection-profile: minimal + name: kubelet-minimal + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + honorLabels: true + interval: 30s + metricRelabelings: + - action: keep + regex: (apiserver_audit_event_total|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_usage_bytes|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|container_spec_cpu_shares|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_containers_per_pod_count_sum|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_bucket|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_free|kubelet_volume_stats_inodes_used|kubelet_volume_stats_used_bytes|machine_cpu_cores|machine_memory_bytes|process_start_time_seconds|rest_client_requests_total|storage_operation_duration_seconds_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - 
__metrics_path__ + targetLabel: metrics_path + scheme: https + scrapeTimeout: 30s + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + - bearerTokenFile: "" + honorLabels: true + honorTimestamps: false + interval: 30s + metricRelabelings: + - action: labeldrop + regex: __tmp_keep_metric + - action: keep + regex: (apiserver_audit_event_total|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_usage_bytes|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|container_spec_cpu_shares|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_containers_per_pod_count_sum|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_bucket|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_free|kubelet_volume_stats_inodes_used|kubelet_volume_stats_used_bytes|machine_cpu_cores|machine_memory_bytes|process_start_time_seconds|rest_client_requests_total|storage_operation_duration_seconds_count) + sourceLabels: + - __name__ + path: /metrics/cadvisor + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + scrapeTimeout: 30s + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + - bearerTokenFile: "" + honorLabels: true + interval: 30s + metricRelabelings: + - action: keep + regex: 
(apiserver_audit_event_total|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_usage_bytes|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|container_spec_cpu_shares|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_containers_per_pod_count_sum|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_bucket|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_free|kubelet_volume_stats_inodes_used|kubelet_volume_stats_used_bytes|machine_cpu_cores|machine_memory_bytes|process_start_time_seconds|rest_client_requests_total|storage_operation_duration_seconds_count) + sourceLabels: + - __name__ + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + scheme: https + scrapeTimeout: 30s + tlsConfig: + caFile: /etc/prometheus/configmaps/kubelet-serving-ca-bundle/ca-bundle.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + - interval: 30s + metricRelabelings: + - action: keep + regex: (apiserver_audit_event_total|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_usage_bytes|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_usage_bytes|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|container_spec_cpu_shares|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_containers_per_pod_count_sum|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_bucket|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_free|kubelet_volume_stats_inodes_used|kubelet_volume_stats_used_bytes|machine_cpu_cores|machine_memory_bytes|process_start_time_seconds|rest_client_requests_total|storage_operation_duration_seconds_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - action: replace + regex: (.+)(?::\d+) + replacement: $1:9537 + sourceLabels: + - __address__ + targetLabel: __address__ + - action: replace + replacement: crio + sourceLabels: + - endpoint + targetLabel: endpoint + - action: replace + replacement: crio + targetLabel: job + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kubelet diff --git a/assets/control-plane/service-monitor-etcd.yaml 
b/assets/control-plane/service-monitor-etcd.yaml index 1e898ded21..7269a0dfab 100644 --- a/assets/control-plane/service-monitor-etcd.yaml +++ b/assets/control-plane/service-monitor-etcd.yaml @@ -4,6 +4,7 @@ metadata: labels: app.kubernetes.io/name: etcd k8s-app: etcd + monitoring.openshift.io/collection-profile: full name: etcd namespace: openshift-monitoring spec: diff --git a/assets/control-plane/service-monitor-kubelet-resource-metrics.yaml b/assets/control-plane/service-monitor-kubelet-resource-metrics.yaml index e5de1b4e89..cb8f4ed710 100644 --- a/assets/control-plane/service-monitor-kubelet-resource-metrics.yaml +++ b/assets/control-plane/service-monitor-kubelet-resource-metrics.yaml @@ -5,6 +5,7 @@ metadata: app.kubernetes.io/name: kubelet app.kubernetes.io/part-of: openshift-monitoring k8s-app: kubelet + monitoring.openshift.io/collection-profile: full name: kubelet-resource-metrics namespace: openshift-monitoring spec: diff --git a/assets/control-plane/service-monitor-kubelet.yaml b/assets/control-plane/service-monitor-kubelet.yaml index 9fea2d6d1e..9f404ced91 100644 --- a/assets/control-plane/service-monitor-kubelet.yaml +++ b/assets/control-plane/service-monitor-kubelet.yaml @@ -5,6 +5,7 @@ metadata: app.kubernetes.io/name: kubelet app.kubernetes.io/part-of: openshift-monitoring k8s-app: kubelet + monitoring.openshift.io/collection-profile: full name: kubelet namespace: openshift-monitoring spec: diff --git a/assets/kube-state-metrics/minimal-service-monitor.yaml b/assets/kube-state-metrics/minimal-service-monitor.yaml new file mode 100644 index 0000000000..482f42df2f --- /dev/null +++ b/assets/kube-state-metrics/minimal-service-monitor.yaml @@ -0,0 +1,57 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 2.8.1 + monitoring.openshift.io/collection-profile: minimal + name: kube-state-metrics-minimal + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + honorLabels: true + interval: 1m + metricRelabelings: + - action: labeldrop + regex: instance + - action: keep + regex: 
(kube_daemonset_status_current_number_scheduled|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_available|kube_daemonset_status_number_misscheduled|kube_daemonset_status_updated_number_scheduled|kube_deployment_metadata_generation|kube_deployment_spec_replicas|kube_deployment_status_observed_generation|kube_deployment_status_replicas_available|kube_deployment_status_replicas_updated|kube_horizontalpodautoscaler_spec_max_replicas|kube_horizontalpodautoscaler_spec_min_replicas|kube_horizontalpodautoscaler_status_current_replicas|kube_horizontalpodautoscaler_status_desired_replicas|kube_job_failed|kube_job_status_active|kube_job_status_start_time|kube_node_info|kube_node_labels|kube_node_role|kube_node_spec_taint|kube_node_spec_unschedulable|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_persistentvolume_info|kube_persistentvolume_status_phase|kube_persistentvolumeclaim_access_mode|kube_persistentvolumeclaim_info|kube_persistentvolumeclaim_labels|kube_persistentvolumeclaim_resource_requests_storage_bytes|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_last_terminated_reason|kube_pod_container_status_restarts_total|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_owner|kube_pod_status_phase|kube_pod_status_ready|kube_pod_status_unschedulable|kube_poddisruptionbudget_status_current_healthy|kube_poddisruptionbudget_status_desired_healthy|kube_poddisruptionbudget_status_expected_pods|kube_replicaset_owner|kube_replicationcontroller_owner|kube_resourcequota|kube_state_metrics_list_total|kube_state_metrics_watch_total|kube_statefulset_metadata_generation|kube_statefulset_replicas|kube_statefulset_status_current_revision|kube_statefulset_status_observed_generation|kube_statefulset_status_replicas|kube_statefulset_status_replicas_ready|kube_statefulset_status_replicas_updated|kube_statefulset_status_update_revision|kube_storageclass_info|process_start_time_seconds) + sourceLabels: + - __name__ + port: https-main + relabelings: + - action: labeldrop + regex: pod + scheme: https + scrapeTimeout: 1m + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + serverName: kube-state-metrics.openshift-monitoring.svc + - bearerTokenFile: "" + interval: 1m + metricRelabelings: + - action: keep + regex: 
(kube_daemonset_status_current_number_scheduled|kube_daemonset_status_desired_number_scheduled|kube_daemonset_status_number_available|kube_daemonset_status_number_misscheduled|kube_daemonset_status_updated_number_scheduled|kube_deployment_metadata_generation|kube_deployment_spec_replicas|kube_deployment_status_observed_generation|kube_deployment_status_replicas_available|kube_deployment_status_replicas_updated|kube_horizontalpodautoscaler_spec_max_replicas|kube_horizontalpodautoscaler_spec_min_replicas|kube_horizontalpodautoscaler_status_current_replicas|kube_horizontalpodautoscaler_status_desired_replicas|kube_job_failed|kube_job_status_active|kube_job_status_start_time|kube_node_info|kube_node_labels|kube_node_role|kube_node_spec_taint|kube_node_spec_unschedulable|kube_node_status_allocatable|kube_node_status_capacity|kube_node_status_condition|kube_persistentvolume_info|kube_persistentvolume_status_phase|kube_persistentvolumeclaim_access_mode|kube_persistentvolumeclaim_info|kube_persistentvolumeclaim_labels|kube_persistentvolumeclaim_resource_requests_storage_bytes|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_last_terminated_reason|kube_pod_container_status_restarts_total|kube_pod_container_status_waiting_reason|kube_pod_info|kube_pod_owner|kube_pod_status_phase|kube_pod_status_ready|kube_pod_status_unschedulable|kube_poddisruptionbudget_status_current_healthy|kube_poddisruptionbudget_status_desired_healthy|kube_poddisruptionbudget_status_expected_pods|kube_replicaset_owner|kube_replicationcontroller_owner|kube_resourcequota|kube_state_metrics_list_total|kube_state_metrics_watch_total|kube_statefulset_metadata_generation|kube_statefulset_replicas|kube_statefulset_status_current_revision|kube_statefulset_status_observed_generation|kube_statefulset_status_replicas|kube_statefulset_status_replicas_ready|kube_statefulset_status_replicas_updated|kube_statefulset_status_update_revision|kube_storageclass_info|process_start_time_seconds) + sourceLabels: + - __name__ + port: https-self + scheme: https + scrapeTimeout: 1m + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + serverName: kube-state-metrics.openshift-monitoring.svc + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: openshift-monitoring diff --git a/assets/kube-state-metrics/service-monitor.yaml b/assets/kube-state-metrics/service-monitor.yaml index c765012a69..1f4834f779 100644 --- a/assets/kube-state-metrics/service-monitor.yaml +++ b/assets/kube-state-metrics/service-monitor.yaml @@ -6,6 +6,7 @@ metadata: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/part-of: openshift-monitoring app.kubernetes.io/version: 2.8.1 + monitoring.openshift.io/collection-profile: full name: kube-state-metrics namespace: openshift-monitoring spec: diff --git a/assets/node-exporter/minimal-service-monitor.yaml b/assets/node-exporter/minimal-service-monitor.yaml new file mode 100644 index 0000000000..925f087012 --- /dev/null +++ b/assets/node-exporter/minimal-service-monitor.yaml @@ -0,0 +1,41 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + 
app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 1.5.0 + monitoring.openshift.io/collection-profile: minimal + name: node-exporter-minimal + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 15s + metricRelabelings: + - action: keep + regex: (node_cpu_info|node_cpu_seconds_total|node_disk_io_time_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_read_time_seconds_total|node_disk_reads_completed_total|node_disk_write_time_seconds_total|node_disk_writes_completed_total|node_filefd_allocated|node_filefd_maximum|node_filesystem_avail_bytes|node_filesystem_files|node_filesystem_files_free|node_filesystem_free_bytes|node_filesystem_readonly|node_filesystem_size_bytes|node_load1|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_memory_Slab_bytes|node_netstat_TcpExt_TCPSynRetrans|node_netstat_Tcp_OutSegs|node_netstat_Tcp_RetransSegs|node_network_receive_bytes_total|node_network_receive_drop_total|node_network_receive_errs_total|node_network_receive_packets_total|node_network_transmit_bytes_total|node_network_transmit_drop_total|node_network_transmit_errs_total|node_network_transmit_packets_total|node_network_up|node_nf_conntrack_entries|node_nf_conntrack_entries_limit|node_textfile_scrape_error|node_timex_maxerror_seconds|node_timex_offset_seconds|node_timex_sync_status|node_vmstat_pgmajfault|process_start_time_seconds|virt_platform) + sourceLabels: + - __name__ + port: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: https + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + serverName: node-exporter.openshift-monitoring.svc + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: openshift-monitoring diff --git a/assets/node-exporter/service-monitor.yaml b/assets/node-exporter/service-monitor.yaml index 79c4864a7f..9d3ba63ed3 100644 --- a/assets/node-exporter/service-monitor.yaml +++ b/assets/node-exporter/service-monitor.yaml @@ -6,6 +6,7 @@ metadata: app.kubernetes.io/name: node-exporter app.kubernetes.io/part-of: openshift-monitoring app.kubernetes.io/version: 1.5.0 + monitoring.openshift.io/collection-profile: full name: node-exporter namespace: openshift-monitoring spec: diff --git a/assets/prometheus-adapter/minimal-service-monitor.yaml b/assets/prometheus-adapter/minimal-service-monitor.yaml new file mode 100644 index 0000000000..905b0c74f6 --- /dev/null +++ b/assets/prometheus-adapter/minimal-service-monitor.yaml @@ -0,0 +1,33 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: openshift-monitoring + app.kubernetes.io/version: 0.10.0 + monitoring.openshift.io/collection-profile: minimal + name: prometheus-adapter-minimal + namespace: openshift-monitoring +spec: + endpoints: + - bearerTokenFile: "" + interval: 30s + metricRelabelings: + - action: keep + regex: 
(apiserver_audit_event_total|apiserver_current_inflight_requests|apiserver_request_duration_seconds_bucket|apiserver_request_duration_seconds_count|apiserver_request_total|process_start_time_seconds) + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt + certFile: /etc/prometheus/secrets/metrics-client-certs/tls.crt + insecureSkipVerify: false + keyFile: /etc/prometheus/secrets/metrics-client-certs/tls.key + serverName: prometheus-adapter.openshift-monitoring.svc + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: openshift-monitoring diff --git a/assets/prometheus-adapter/service-monitor.yaml b/assets/prometheus-adapter/service-monitor.yaml index 5e4ceeb919..6b8581fec6 100644 --- a/assets/prometheus-adapter/service-monitor.yaml +++ b/assets/prometheus-adapter/service-monitor.yaml @@ -6,6 +6,7 @@ metadata: app.kubernetes.io/name: prometheus-adapter app.kubernetes.io/part-of: openshift-monitoring app.kubernetes.io/version: 0.10.0 + monitoring.openshift.io/collection-profile: full name: prometheus-adapter namespace: openshift-monitoring spec: diff --git a/go.mod b/go.mod index 5ab7705844..199402184f 100644 --- a/go.mod +++ b/go.mod @@ -18,6 +18,8 @@ require ( github.com/prometheus/client_golang v1.14.0 github.com/prometheus/common v0.39.0 github.com/prometheus/prometheus v0.41.0 + github.com/stretchr/testify v1.8.1 + golang.org/x/exp v0.0.0-20221212164502-fae10dda9338 golang.org/x/sync v0.1.0 gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 @@ -121,7 +123,6 @@ require ( github.com/spf13/cobra v1.6.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.2.0 // indirect - github.com/stretchr/testify v1.8.1 // indirect github.com/thanos-io/thanos v0.30.0 // indirect github.com/xlab/treeprint v1.1.0 // indirect go.etcd.io/etcd/api/v3 v3.5.5 // indirect @@ -144,7 +145,6 @@ require ( go.uber.org/multierr v1.8.0 // indirect go.uber.org/zap v1.21.0 // indirect golang.org/x/crypto v0.1.0 // indirect - golang.org/x/exp v0.0.0-20221212164502-fae10dda9338 // indirect golang.org/x/net v0.4.0 // indirect golang.org/x/oauth2 v0.3.0 // indirect golang.org/x/sys v0.3.0 // indirect diff --git a/jsonnet/components/control-plane.libsonnet b/jsonnet/components/control-plane.libsonnet index 276bdaab4f..f059ebe7b2 100644 --- a/jsonnet/components/control-plane.libsonnet +++ b/jsonnet/components/control-plane.libsonnet @@ -1,3 +1,4 @@ +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; local controlPlane = import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet'; function(params) @@ -18,6 +19,7 @@ function(params) labels: { 'app.kubernetes.io/name': 'etcd', 'k8s-app': 'etcd', + 'monitoring.openshift.io/collection-profile': 'full', }, }, spec: { @@ -46,12 +48,33 @@ function(params) }, }, + minimalServiceMonitorEtcd: generateServiceMonitor.minimal( + self.serviceMonitorEtcd, std.join('|', + [ + 'etcd_disk_backend_commit_duration_seconds_bucket', + 'etcd_disk_wal_fsync_duration_seconds_bucket', + 'etcd_mvcc_db_total_size_in_bytes', + 'etcd_mvcc_db_total_size_in_use_in_bytes', + 'etcd_network_peer_round_trip_time_seconds_bucket', + 'etcd_network_peer_sent_failures_total', + 'etcd_server_has_leader', + 'etcd_server_is_leader', + 'etcd_server_proposals_failed_total', + 'etcd_server_quota_backend_bytes', + 
'grpc_server_handled_total', + 'grpc_server_handling_seconds_bucket', + 'grpc_server_started_total', + 'process_start_time_seconds', + ]) + ), + // This changes the kubelet's certificates to be validated when // scraping. serviceMonitorKubelet+: { metadata+: { labels+: { 'k8s-app': 'kubelet', + 'monitoring.openshift.io/collection-profile': 'full', }, }, spec+: { @@ -136,6 +159,50 @@ function(params) }, }, + minimalServiceMonitorKubelet: generateServiceMonitor.minimal( + self.serviceMonitorKubelet, std.join('|', + [ + 'apiserver_audit_event_total', + 'container_cpu_cfs_periods_total', + 'container_cpu_cfs_throttled_periods_total', + 'container_cpu_usage_seconds_total', + 'container_fs_reads_bytes_total', + 'container_fs_reads_total', + 'container_fs_usage_bytes', + 'container_fs_writes_bytes_total', + 'container_fs_writes_total', + 'container_memory_cache', + 'container_memory_rss', + 'container_memory_swap', + 'container_memory_usage_bytes', + 'container_memory_working_set_bytes', + 'container_network_receive_bytes_total', + 'container_network_receive_packets_dropped_total', + 'container_network_receive_packets_total', + 'container_network_transmit_bytes_total', + 'container_network_transmit_packets_dropped_total', + 'container_network_transmit_packets_total', + 'container_spec_cpu_shares', + 'kubelet_certificate_manager_client_expiration_renew_errors', + 'kubelet_containers_per_pod_count_sum', + 'kubelet_node_name', + 'kubelet_pleg_relist_duration_seconds_bucket', + 'kubelet_pod_worker_duration_seconds_bucket', + 'kubelet_server_expiration_renew_errors', + 'kubelet_volume_stats_available_bytes', + 'kubelet_volume_stats_capacity_bytes', + 'kubelet_volume_stats_inodes', + 'kubelet_volume_stats_inodes_free', + 'kubelet_volume_stats_inodes_used', + 'kubelet_volume_stats_used_bytes', + 'machine_cpu_cores', + 'machine_memory_bytes', + 'process_start_time_seconds', + 'rest_client_requests_total', + 'storage_operation_duration_seconds_count', + ]) + ), + // This adds a kubelet ServiceMonitor for special use with // prometheus-adapter if enabled by the configuration of the cluster monitoring operator. serviceMonitorKubeletResourceMetrics: self.serviceMonitorKubelet { diff --git a/jsonnet/components/kube-state-metrics.libsonnet b/jsonnet/components/kube-state-metrics.libsonnet index 5541f89a43..6e14fa4b9c 100644 --- a/jsonnet/components/kube-state-metrics.libsonnet +++ b/jsonnet/components/kube-state-metrics.libsonnet @@ -3,6 +3,7 @@ local tlsVolumeName = 'kube-state-metrics-tls'; local kubeStateMetrics = import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/kube-state-metrics.libsonnet'; local generateSecret = import '../utils/generate-secret.libsonnet'; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) local cfg = params; @@ -22,6 +23,12 @@ function(params) // This changes kube-state-metrics to be scraped with validating TLS. 
serviceMonitor+: { + metadata+: { + name: super.name, + labels+: { + 'monitoring.openshift.io/collection-profile': 'full', + }, + }, spec+: { endpoints: [ { @@ -61,6 +68,71 @@ function(params) }, }, + minimalServiceMonitor: generateServiceMonitor.minimal( + self.serviceMonitor, std.join('|', + [ + 'kube_daemonset_status_current_number_scheduled', + 'kube_daemonset_status_desired_number_scheduled', + 'kube_daemonset_status_number_available', + 'kube_daemonset_status_number_misscheduled', + 'kube_daemonset_status_updated_number_scheduled', + 'kube_deployment_metadata_generation', + 'kube_deployment_spec_replicas', + 'kube_deployment_status_observed_generation', + 'kube_deployment_status_replicas_available', + 'kube_deployment_status_replicas_updated', + 'kube_horizontalpodautoscaler_spec_max_replicas', + 'kube_horizontalpodautoscaler_spec_min_replicas', + 'kube_horizontalpodautoscaler_status_current_replicas', + 'kube_horizontalpodautoscaler_status_desired_replicas', + 'kube_job_failed', + 'kube_job_status_active', + 'kube_job_status_start_time', + 'kube_node_info', + 'kube_node_labels', + 'kube_node_role', + 'kube_node_spec_taint', + 'kube_node_spec_unschedulable', + 'kube_node_status_allocatable', + 'kube_node_status_capacity', + 'kube_node_status_condition', + 'kube_persistentvolume_info', + 'kube_persistentvolume_status_phase', + 'kube_persistentvolumeclaim_access_mode', + 'kube_persistentvolumeclaim_info', + 'kube_persistentvolumeclaim_labels', + 'kube_persistentvolumeclaim_resource_requests_storage_bytes', + 'kube_pod_container_resource_limits', + 'kube_pod_container_resource_requests', + 'kube_pod_container_status_last_terminated_reason', + 'kube_pod_container_status_restarts_total', + 'kube_pod_container_status_waiting_reason', + 'kube_pod_info', + 'kube_pod_owner', + 'kube_pod_status_phase', + 'kube_pod_status_ready', + 'kube_pod_status_unschedulable', + 'kube_poddisruptionbudget_status_current_healthy', + 'kube_poddisruptionbudget_status_desired_healthy', + 'kube_poddisruptionbudget_status_expected_pods', + 'kube_replicaset_owner', + 'kube_replicationcontroller_owner', + 'kube_resourcequota', + 'kube_state_metrics_list_total', + 'kube_state_metrics_watch_total', + 'kube_statefulset_metadata_generation', + 'kube_statefulset_replicas', + 'kube_statefulset_status_current_revision', + 'kube_statefulset_status_observed_generation', + 'kube_statefulset_status_replicas', + 'kube_statefulset_status_replicas_ready', + 'kube_statefulset_status_replicas_updated', + 'kube_statefulset_status_update_revision', + 'kube_storageclass_info', + 'process_start_time_seconds', + ]) + ), + kubeRbacProxySecret: generateSecret.staticAuthSecret(cfg.namespace, cfg.commonLabels, 'kube-state-metrics-kube-rbac-proxy-config'), // This removes the upstream addon-resizer and all resource requests and diff --git a/jsonnet/components/node-exporter.libsonnet b/jsonnet/components/node-exporter.libsonnet index ae2a246d27..129e192a1a 100644 --- a/jsonnet/components/node-exporter.libsonnet +++ b/jsonnet/components/node-exporter.libsonnet @@ -6,6 +6,7 @@ local wtmpVolumeName = 'node-exporter-wtmp'; local nodeExporter = import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/node-exporter.libsonnet'; local generateSecret = import '../utils/generate-secret.libsonnet'; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; function(params) local cfg = params; @@ -23,6 +24,65 @@ function(params) }, }, + // This changes node-exporter to be scraped with 
validating TLS. + serviceMonitor+: { + metadata+: { + labels+: { + 'monitoring.openshift.io/collection-profile': 'full', + }, + }, + }, + + minimalServiceMonitor: generateServiceMonitor.minimal( + self.serviceMonitor, std.join('|', + [ + 'node_cpu_info', + 'node_cpu_seconds_total', + 'node_disk_io_time_seconds_total', + 'node_disk_io_time_weighted_seconds_total', + 'node_disk_read_time_seconds_total', + 'node_disk_reads_completed_total', + 'node_disk_write_time_seconds_total', + 'node_disk_writes_completed_total', + 'node_filefd_allocated', + 'node_filefd_maximum', + 'node_filesystem_avail_bytes', + 'node_filesystem_files', + 'node_filesystem_files_free', + 'node_filesystem_free_bytes', + 'node_filesystem_readonly', + 'node_filesystem_size_bytes', + 'node_load1', + 'node_memory_Buffers_bytes', + 'node_memory_Cached_bytes', + 'node_memory_MemAvailable_bytes', + 'node_memory_MemFree_bytes', + 'node_memory_MemTotal_bytes', + 'node_memory_Slab_bytes', + 'node_netstat_TcpExt_TCPSynRetrans', + 'node_netstat_Tcp_OutSegs', + 'node_netstat_Tcp_RetransSegs', + 'node_network_receive_bytes_total', + 'node_network_receive_drop_total', + 'node_network_receive_errs_total', + 'node_network_receive_packets_total', + 'node_network_transmit_bytes_total', + 'node_network_transmit_drop_total', + 'node_network_transmit_errs_total', + 'node_network_transmit_packets_total', + 'node_network_up', + 'node_nf_conntrack_entries', + 'node_nf_conntrack_entries_limit', + 'node_textfile_scrape_error', + 'node_timex_maxerror_seconds', + 'node_timex_offset_seconds', + 'node_timex_sync_status', + 'node_vmstat_pgmajfault', + 'process_start_time_seconds', + 'virt_platform', + ]) + ), + securityContextConstraints: { allowHostDirVolumePlugin: true, allowHostNetwork: true, diff --git a/jsonnet/components/prometheus-adapter.libsonnet b/jsonnet/components/prometheus-adapter.libsonnet index fb45eedac7..56ceda7578 100644 --- a/jsonnet/components/prometheus-adapter.libsonnet +++ b/jsonnet/components/prometheus-adapter.libsonnet @@ -12,6 +12,7 @@ local servingCertsCABundleFileName = 'service-ca.crt'; local servingCertsCABundleMountPath = '/etc/%s' % servingCertsCABundleDirectory; local generateCertInjection = import '../utils/generate-certificate-injection.libsonnet'; +local generateServiceMonitor = import '../utils/generate-service-monitors.libsonnet'; local prometheusAdapter = (import 'github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/components/prometheus-adapter.libsonnet'); @@ -71,6 +72,26 @@ function(params) }, }, + serviceMonitor+: { + metadata+: { + labels+: { + 'monitoring.openshift.io/collection-profile': 'full', + }, + }, + }, + + minimalServiceMonitor: generateServiceMonitor.minimal( + self.serviceMonitor, std.join('|', + [ + 'apiserver_audit_event_total', + 'apiserver_current_inflight_requests', + 'apiserver_request_duration_seconds_bucket', + 'apiserver_request_duration_seconds_count', + 'apiserver_request_total', + 'process_start_time_seconds', + ]) + ), + deployment+: { metadata+: { diff --git a/jsonnet/utils/configure-authentication-for-monitors.libsonnet b/jsonnet/utils/configure-authentication-for-monitors.libsonnet index 22206a13a4..7ed38f785b 100644 --- a/jsonnet/utils/configure-authentication-for-monitors.libsonnet +++ b/jsonnet/utils/configure-authentication-for-monitors.libsonnet @@ -1,7 +1,7 @@ { configureAuthenticationForMonitors(o): { local configureAuthentication(o) = o { - [if (o.kind == 'ServiceMonitor' && o.metadata.name != 'etcd') || o.kind == 'PodMonitor' then 'spec']+: { + [if 
(o.kind == 'ServiceMonitor' && !std.startsWith(o.metadata.name, 'etcd')) || o.kind == 'PodMonitor' then 'spec']+: { [if o.kind == 'ServiceMonitor' then 'endpoints' else 'podMetricsEndpoints']: [ if std.objectHas(e, 'scheme') && e.scheme == 'https' then e { @@ -14,7 +14,25 @@ if !(std.objectHas(o.metadata.labels, 'app.kubernetes.io/name') && o.metadata.labels['app.kubernetes.io/name'] == 'kubelet') then { caFile: '/etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt', - serverName: std.format('%s.%s.svc', [if o.metadata.name != 'thanos-sidecar' then o.metadata.name else 'prometheus-' + o.metadata.labels['app.kubernetes.io/instance'] + '-' + o.metadata.name, o.metadata.namespace]), + // For setting serverName, the following logic is applied: + // 1. Prometheus thanos sidecar: the SA that is created for thanos sidecars has a + // different name than the ServiceMonitor. The name follows the convention + // "prometheus-$PROM_INSTANCE-thanos-sidecar", where $PROM_INSTANCE is either "k8s" or "user-workload" + // 2. ServiceMonitors that adopted CollectionProfiles end with -$COLLECTION_PROFILE, + // thus we strip "-$COLLECTION_PROFILE" from o.metadata.name + // 3. Default behaviour for the majority of ServiceMonitors: the ServiceMonitor has the same + // name as the SA + serverName: std.format('%s.%s.svc', + [ + if o.metadata.name == 'thanos-sidecar' then + 'prometheus-' + o.metadata.labels['app.kubernetes.io/instance'] + '-' + o.metadata.name + else + if std.objectHas(o.metadata.labels, 'monitoring.openshift.io/collection-profile') then + std.rstripChars(o.metadata.name, '-' + o.metadata.labels['monitoring.openshift.io/collection-profile']) + else + o.metadata.name, + o.metadata.namespace, + ]), } else {}, diff --git a/jsonnet/utils/generate-service-monitors.libsonnet b/jsonnet/utils/generate-service-monitors.libsonnet new file mode 100644 index 0000000000..0e4ee92558 --- /dev/null +++ b/jsonnet/utils/generate-service-monitors.libsonnet @@ -0,0 +1,43 @@ +{ + local minimalLabel = { + 'monitoring.openshift.io/collection-profile': 'minimal', + }, + // 1. Add the suffix "-minimal" to the ServiceMonitor name + // 2. Add the minimal label "monitoring.openshift.io/collection-profile: minimal" + // 3. 
Add a metricRelabelings entry with the "keep" action and a regex built from the given metrics + local minimal(sm, metrics) = sm { + metadata+: { + name+: '-minimal', + labels+: minimalLabel, + }, + spec+: { + endpoints: std.map( + function(e) e { + metricRelabelings+: [ + { + sourceLabels: ['__name__'], + action: 'keep', + regex: '(' + metrics + ')', + }, + ], + }, sm.spec.endpoints + ), + }, + }, + // Removes all metricRelabelings entries with the action "drop" from + // ServiceMonitor.spec.endpoints[].metricRelabelings + local removeDrop(sm) = sm { + spec+: { + endpoints: std.map( + function(e) e + + if std.objectHas(e, 'metricRelabelings') then + { + metricRelabelings: [x for x in e.metricRelabelings if std.objectHas(x, 'action') && x.action != 'drop'], + } else {}, + sm.spec.endpoints + ), + }, + }, + + minimal(sm, metrics): minimal(removeDrop(sm), metrics), +} diff --git a/pkg/manifests/config.go b/pkg/manifests/config.go index 1dfcdcd2da..d34672d798 100644 --- a/pkg/manifests/config.go +++ b/pkg/manifests/config.go @@ -21,9 +21,12 @@ import ( "fmt" "io" "math" + "strings" configv1 "github.com/openshift/api/config/v1" + "github.com/pkg/errors" poperator "github.com/prometheus-operator/prometheus-operator/pkg/operator" + "golang.org/x/exp/slices" v1 "k8s.io/api/core/v1" k8syaml "k8s.io/apimachinery/pkg/util/yaml" auditv1 "k8s.io/apiserver/pkg/apis/audit/v1" @@ -47,6 +50,7 @@ const ( type Config struct { Images *Images `json:"-"` RemoteWrite bool `json:"-"` + TechPreview bool `json:"-"` ClusterMonitoringConfiguration *ClusterMonitoringConfiguration `json:"-"` UserWorkloadConfiguration *UserWorkloadConfiguration `json:"-"` @@ -186,7 +190,17 @@ func (cfg *TelemeterClientConfig) IsEnabled() bool { return true } -func NewConfig(content io.Reader) (*Config, error) { +func (cps CollectionProfiles) String() string { + var sb strings.Builder + for i := 0; i < len(cps)-1; i++ { + sb.WriteString(string(cps[i])) + sb.WriteString(", ") + } + sb.WriteString(string(cps[len(cps)-1])) + return sb.String() +} + +func NewConfig(content io.Reader, tp bool) (*Config, error) { c := Config{} cmc := defaultClusterMonitoringConfiguration() err := k8syaml.NewYAMLOrJSONDecoder(content, 4096).Decode(&cmc) @@ -197,6 +211,17 @@ func NewConfig(content io.Reader) (*Config, error) { res := &c res.applyDefaults() c.UserWorkloadConfiguration = NewDefaultUserWorkloadMonitoringConfig() + // The operator should only create some manifests if techPreview is enabled + c.TechPreview = tp + + if c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile != FullCollectionProfile && !tp { + return nil, errors.Wrap(ErrConfigValidation, "collectionProfiles is a TechPreview feature; to use a profile other than the default (\"full\"), please enable TechPreview") + } + // Validate CollectionProfile field + if !slices.Contains(SupportedCollectionProfiles, c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile) { + return nil, errors.Wrap(ErrConfigValidation, fmt.Sprintf(`%q is not supported, supported collection profiles are: %q`, c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile, SupportedCollectionProfiles.String())) + } + return res, nil } @@ -269,6 +294,10 @@ func (c *Config) applyDefaults() { if c.ClusterMonitoringConfiguration.EtcdConfig == nil { c.ClusterMonitoringConfiguration.EtcdConfig = &EtcdConfig{} } + + if c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile == "" { + c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile = FullCollectionProfile + } } func (c 
*Config) SetImages(images map[string]string) { @@ -394,12 +423,17 @@ func calculateBodySizeLimit(podCapacity int) string { return fmt.Sprintf("%dMB", int(math.Ceil(float64(bodySize)/(1024*1024)))) } -func NewConfigFromString(content string) (*Config, error) { +// NewConfigFromString transforms a string containing configuration in the +// openshift-monitoring/cluster-monitoring-configuration format into a data +// structure that facilitates programmatic checks of that configuration. The +// content of the data structure might change if TechPreview is enabled (tp), as +// some features are only meant for TechPreview. +func NewConfigFromString(content string, tp bool) (*Config, error) { if content == "" { return NewDefaultConfig(), nil } - return NewConfig(bytes.NewBuffer([]byte(content))) + return NewConfig(bytes.NewBuffer([]byte(content)), tp) } func NewDefaultConfig() *Config { diff --git a/pkg/manifests/config_test.go b/pkg/manifests/config_test.go index ca64ec7c7b..90da9d22b2 100644 --- a/pkg/manifests/config_test.go +++ b/pkg/manifests/config_test.go @@ -15,12 +15,13 @@ package manifests import ( - "bytes" "context" "errors" "io/ioutil" "os" "testing" + + "github.com/stretchr/testify/require" ) func TestConfigParsing(t *testing.T) { @@ -28,7 +29,7 @@ func TestConfigParsing(t *testing.T) { if err != nil { t.Fatal(err) } - c, err := NewConfig(f) + c, err := NewConfig(f, false) if err != nil { t.Fatal(err) } @@ -61,7 +62,7 @@ func TestNewUserConfigFromStringParsing(t *testing.T) { } func TestEmptyConfigIsValid(t *testing.T) { - _, err := NewConfigFromString("") + _, err := NewConfigFromString("", false) if err != nil { t.Fatal(err) } @@ -174,14 +175,14 @@ func TestTelemeterClientConfig(t *testing.T) { } func TestEtcdDefaultsToDisabled(t *testing.T) { - c, err := NewConfigFromString("") + c, err := NewConfigFromString("", false) if err != nil { t.Fatal(err) } if c.ClusterMonitoringConfiguration.EtcdConfig.IsEnabled() { t.Error("an empty configuration should have etcd disabled") } - c, err = NewConfigFromString(`{"etcd":{}}`) + c, err = NewConfigFromString(`{"etcd":{}}`, false) if err != nil { t.Fatal(err) } @@ -191,21 +192,21 @@ func TestEtcdDefaultsToDisabled(t *testing.T) { } func TestPromAdapterDedicatedSMsDefaultsToDisabled(t *testing.T) { - c, err := NewConfigFromString("") + c, err := NewConfigFromString("", false) if err != nil { t.Fatal(err) } if c.ClusterMonitoringConfiguration.K8sPrometheusAdapter.DedicatedServiceMonitors.Enabled { t.Error("an empty configuration should have prometheus-adapter dedicated ServiceMonitors dislabled") } - c, err = NewConfigFromString(`{"k8sPrometheusAdapter":{}}`) + c, err = NewConfigFromString(`{"k8sPrometheusAdapter":{}}`, false) if err != nil { t.Fatal(err) } if c.ClusterMonitoringConfiguration.K8sPrometheusAdapter.DedicatedServiceMonitors.Enabled { t.Error("an empty k8sPrometheusAdapter configuration should have prometheus-adapter dedicated ServiceMonitors dislabled") } - c, err = NewConfigFromString(`{"k8sPrometheusAdapter":{"dedicatedServiceMonitors":{}}}`) + c, err = NewConfigFromString(`{"k8sPrometheusAdapter":{"dedicatedServiceMonitors":{}}}`, false) if err != nil { t.Fatal(err) } @@ -221,7 +222,7 @@ func TestHttpProxyConfig(t *testing.T) { noProxy: https://example.com ` - c, err := NewConfig(bytes.NewBufferString(conf)) + c, err := NewConfigFromString(conf, false) if err != nil { t.Errorf("expected no error parsing config - %v", err) } @@ -333,7 +334,7 @@ func TestLoadEnforcedBodySizeLimit(t *testing.T) { }, } { t.Run(tt.name, func(t 
*testing.T) { - c, err := NewConfigFromString(tt.config) + c, err := NewConfigFromString(tt.config, false) if err != nil { t.Fatalf("config parsing error") } @@ -357,3 +358,57 @@ func TestLoadEnforcedBodySizeLimit(t *testing.T) { }) } } + +func TestCollectionProfile(t *testing.T) { + for _, tc := range []struct { + name string + config string + expectedsp CollectionProfile + expectedError bool + }{ + { + name: "default", + config: "", + expectedsp: CollectionProfile("full"), + expectedError: false, + }, + { + name: "full_profile", + config: `prometheusk8s: + collectionProfile: full + `, + expectedsp: CollectionProfile("full"), + expectedError: false, + }, + { + name: "minimal_profile", + config: `prometheusk8s: + collectionProfile: minimal + `, + expectedsp: CollectionProfile("minimal"), + expectedError: false, + }, + { + name: "incorrect_profile", + config: `prometheusk8s: + collectionProfile: foo + `, + expectedsp: "", + expectedError: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + c, err := NewConfigFromString(tc.config, true) + if err != nil { + if tc.expectedError { + return + } + require.NoError(t, err) + } + + if tc.expectedsp != c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile { + t.Fatalf("incorrect collection profile set, expected %s got %s", tc.expectedsp, c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile) + } + }) + } +} diff --git a/pkg/manifests/manifests.go b/pkg/manifests/manifests.go index 7ef59890f4..9cd9e766a5 100644 --- a/pkg/manifests/manifests.go +++ b/pkg/manifests/manifests.go @@ -64,6 +64,8 @@ const ( userWorkloadAlertmanagerService = "alertmanager-user-workload" telemetryTokenSecretKey = "token" + + collectionProfileLabel = "monitoring.openshift.io/collection-profile" ) var ( @@ -94,14 +96,15 @@ var ( AlertmanagerUserWorkloadPodDisruptionBudget = "alertmanager-user-workload/pod-disruption-budget.yaml" AlertmanagerUserWorkloadServiceMonitor = "alertmanager-user-workload/service-monitor.yaml" - KubeStateMetricsClusterRoleBinding = "kube-state-metrics/cluster-role-binding.yaml" - KubeStateMetricsClusterRole = "kube-state-metrics/cluster-role.yaml" - KubeStateMetricsDeployment = "kube-state-metrics/deployment.yaml" - KubeStateMetricsServiceAccount = "kube-state-metrics/service-account.yaml" - KubeStateMetricsService = "kube-state-metrics/service.yaml" - KubeStateMetricsServiceMonitor = "kube-state-metrics/service-monitor.yaml" - KubeStateMetricsPrometheusRule = "kube-state-metrics/prometheus-rule.yaml" - KubeStateMetricsKubeRbacProxySecret = "kube-state-metrics/kube-rbac-proxy-secret.yaml" + KubeStateMetricsClusterRoleBinding = "kube-state-metrics/cluster-role-binding.yaml" + KubeStateMetricsClusterRole = "kube-state-metrics/cluster-role.yaml" + KubeStateMetricsDeployment = "kube-state-metrics/deployment.yaml" + KubeStateMetricsServiceAccount = "kube-state-metrics/service-account.yaml" + KubeStateMetricsService = "kube-state-metrics/service.yaml" + KubeStateMetricsServiceMonitor = "kube-state-metrics/service-monitor.yaml" + KubeStateMetricsMinimalServiceMonitor = "kube-state-metrics/minimal-service-monitor.yaml" + KubeStateMetricsPrometheusRule = "kube-state-metrics/prometheus-rule.yaml" + KubeStateMetricsKubeRbacProxySecret = "kube-state-metrics/kube-rbac-proxy-secret.yaml" OpenShiftStateMetricsClusterRoleBinding = "openshift-state-metrics/cluster-role-binding.yaml" OpenShiftStateMetricsClusterRole = "openshift-state-metrics/cluster-role.yaml" @@ -118,6 +121,7 @@ var ( NodeExporterClusterRoleBinding = 
"node-exporter/cluster-role-binding.yaml" NodeExporterSecurityContextConstraints = "node-exporter/security-context-constraints.yaml" NodeExporterServiceMonitor = "node-exporter/service-monitor.yaml" + NodeExporterMinimalServiceMonitor = "node-exporter/minimal-service-monitor.yaml" NodeExporterPrometheusRule = "node-exporter/prometheus-rule.yaml" NodeExporterKubeRbacProxySecret = "node-exporter/kube-rbac-proxy-secret.yaml" @@ -184,6 +188,7 @@ var ( PrometheusAdapterRoleBindingAuthReader = "prometheus-adapter/role-binding-auth-reader.yaml" PrometheusAdapterService = "prometheus-adapter/service.yaml" PrometheusAdapterServiceMonitor = "prometheus-adapter/service-monitor.yaml" + PrometheusAdapterMinimalServiceMonitor = "prometheus-adapter/minimal-service-monitor.yaml" PrometheusAdapterServiceAccount = "prometheus-adapter/service-account.yaml" AdmissionWebhookRuleValidatingWebhook = "admission-webhook/prometheus-rule-validating-webhook.yaml" @@ -272,10 +277,12 @@ var ( TelemeterTrustedCABundle = "telemeter-client/trusted-ca-bundle.yaml" - ControlPlanePrometheusRule = "control-plane/prometheus-rule.yaml" - ControlPlaneKubeletServiceMonitor = "control-plane/service-monitor-kubelet.yaml" - ControlPlaneKubeletServiceMonitorPA = "control-plane/service-monitor-kubelet-resource-metrics.yaml" - ControlPlaneEtcdServiceMonitor = "control-plane/service-monitor-etcd.yaml" + ControlPlanePrometheusRule = "control-plane/prometheus-rule.yaml" + ControlPlaneKubeletServiceMonitor = "control-plane/service-monitor-kubelet.yaml" + ControlPlaneKubeletMinimalServiceMonitor = "control-plane/minimal-service-monitor-kubelet.yaml" + ControlPlaneKubeletServiceMonitorPA = "control-plane/service-monitor-kubelet-resource-metrics.yaml" + ControlPlaneEtcdServiceMonitor = "control-plane/service-monitor-etcd.yaml" + ControlPlaneEtcdMinimalServiceMonitor = "control-plane/minimal-service-monitor-etcd.yaml" ) var ( @@ -703,10 +710,18 @@ func (f *Factory) KubeStateMetricsClusterRole() (*rbacv1.ClusterRole, error) { return f.NewClusterRole(f.assets.MustNewAssetReader(KubeStateMetricsClusterRole)) } +func (f *Factory) KubeStateMetricsServiceMonitors() ([]*monv1.ServiceMonitor, error) { + return serviceMonitors(f.config.TechPreview, f.KubeStateMetricsServiceMonitor, f.KubeStateMetricsMinimalServiceMonitor) +} + func (f *Factory) KubeStateMetricsServiceMonitor() (*monv1.ServiceMonitor, error) { return f.NewServiceMonitor(f.assets.MustNewAssetReader(KubeStateMetricsServiceMonitor)) } +func (f *Factory) KubeStateMetricsMinimalServiceMonitor() (*monv1.ServiceMonitor, error) { + return f.NewServiceMonitor(f.assets.MustNewAssetReader(KubeStateMetricsMinimalServiceMonitor)) +} + func (f *Factory) KubeStateMetricsDeployment() (*appsv1.Deployment, error) { d, err := f.NewDeployment(f.assets.MustNewAssetReader(KubeStateMetricsDeployment)) if err != nil { @@ -801,6 +816,10 @@ func (f *Factory) OpenShiftStateMetricsRBACProxySecret() (*v1.Secret, error) { return f.NewSecret(f.assets.MustNewAssetReader(OpenShiftStateMetricsKubeRbacProxySecret)) } +func (f *Factory) NodeExporterServiceMonitors() ([]*monv1.ServiceMonitor, error) { + return serviceMonitors(f.config.TechPreview, f.NodeExporterServiceMonitor, f.NodeExporterMinimalServiceMonitor) +} + func (f *Factory) NodeExporterServiceMonitor() (*monv1.ServiceMonitor, error) { return f.NewServiceMonitor(f.assets.MustNewAssetReader(NodeExporterServiceMonitor)) } @@ -827,6 +846,10 @@ func (f *Factory) updateNodeExporterArgs(args []string) []string { return args } +func (f *Factory) 
NodeExporterMinimalServiceMonitor() (*monv1.ServiceMonitor, error) { + return f.NewServiceMonitor(f.assets.MustNewAssetReader(NodeExporterMinimalServiceMonitor)) +} + func (f *Factory) NodeExporterDaemonSet() (*appsv1.DaemonSet, error) { ds, err := f.NewDaemonSet(f.assets.MustNewAssetReader(NodeExporterDaemonSet)) if err != nil { @@ -1248,6 +1271,10 @@ func (f *Factory) PrometheusK8s(grpcTLS *v1.Secret, trustedCABundleCM *v1.Config return nil, err } + if err := setupProfilesToIgnore(p, f.config.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile); err != nil { + return nil, err + } + clusterID := f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.ClusterID if f.config.ClusterMonitoringConfiguration.TelemeterClientConfig.IsEnabled() && f.config.RemoteWrite { selectorRelabelConfig, err := promqlgen.LabelSelectorsToRelabelConfig(f.config.ClusterMonitoringConfiguration.PrometheusK8sConfig.TelemetryMatches) @@ -1496,6 +1523,41 @@ func (f *Factory) setupQueryLogFile(p *monv1.Prometheus, queryLogFile string) er return nil } +// setupProfilesToIgnore configures the label selectors of the Prometheus ("p") +// to select any ServiceMonitor or PodMonitor that doesn't have the collection +// profile label or that matches the CollectionProfile ("cp"). +func setupProfilesToIgnore(p *monv1.Prometheus, cp CollectionProfile) error { + // Our goal is to configure Prometheus to select the resources that + // either don't have the collection profile label or have the desired value. + // However, with label selectors we are not able to express OR conditions. + // Hence, the only alternative is to configure Prometheus to not select any + // resource that matches either of the collection profiles that we are not + // interested in. + profiles := make([]string, 0, len(SupportedCollectionProfiles)-1) + for _, profile := range SupportedCollectionProfiles { + if profile == cp { + continue + } + profiles = append(profiles, string(profile)) + } + + labelSelector := &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: collectionProfileLabel, + Operator: metav1.LabelSelectorOpNotIn, + Values: profiles, + }, + }, + } + + p.Spec.ServiceMonitorSelector = labelSelector + p.Spec.PodMonitorSelector = labelSelector + p.Spec.ProbeSelector = labelSelector + + return nil +} + func (f *Factory) PrometheusK8sAdditionalAlertManagerConfigsSecret() (*v1.Secret, error) { amConfigs := f.config.ClusterMonitoringConfiguration.PrometheusK8sConfig.AlertmanagerConfigs prometheusAmConfigs := PrometheusAdditionalAlertmanagerConfigs(amConfigs) @@ -1832,10 +1894,18 @@ func (f *Factory) PrometheusAdapterService() (*v1.Service, error) { return f.NewService(f.assets.MustNewAssetReader(PrometheusAdapterService)) } +func (f *Factory) PrometheusAdapterServiceMonitors() ([]*monv1.ServiceMonitor, error) { + return serviceMonitors(f.config.TechPreview, f.PrometheusAdapterServiceMonitor, f.PrometheusAdapterMinimalServiceMonitor) +} + func (f *Factory) PrometheusAdapterServiceMonitor() (*monv1.ServiceMonitor, error) { return f.NewServiceMonitor(f.assets.MustNewAssetReader(PrometheusAdapterServiceMonitor)) } +func (f *Factory) PrometheusAdapterMinimalServiceMonitor() (*monv1.ServiceMonitor, error) { + return f.NewServiceMonitor(f.assets.MustNewAssetReader(PrometheusAdapterMinimalServiceMonitor)) } + func (f *Factory) PrometheusAdapterSecret(tlsSecret *v1.Secret, apiAuthConfigmap *v1.ConfigMap) (*v1.Secret, error) { data := make(map[string]string) @@ -2280,14 +2350,30 @@ func (f *Factory) 
 	}, nil
 }

+func (f *Factory) ControlPlaneEtcdServiceMonitors() ([]*monv1.ServiceMonitor, error) {
+	return serviceMonitors(f.config.TechPreview, f.ControlPlaneEtcdServiceMonitor, f.ControlPlaneEtcdMinimalServiceMonitor)
+}
+
 func (f *Factory) ControlPlaneEtcdServiceMonitor() (*monv1.ServiceMonitor, error) {
 	return f.NewServiceMonitor(f.assets.MustNewAssetReader(ControlPlaneEtcdServiceMonitor))
 }

+func (f *Factory) ControlPlaneEtcdMinimalServiceMonitor() (*monv1.ServiceMonitor, error) {
+	return f.NewServiceMonitor(f.assets.MustNewAssetReader(ControlPlaneEtcdMinimalServiceMonitor))
+}
+
+func (f *Factory) ControlPlaneKubeletServiceMonitors() ([]*monv1.ServiceMonitor, error) {
+	return serviceMonitors(f.config.TechPreview, f.ControlPlaneKubeletServiceMonitor, f.ControlPlaneKubeletMinimalServiceMonitor)
+}
+
 func (f *Factory) ControlPlaneKubeletServiceMonitor() (*monv1.ServiceMonitor, error) {
 	return f.NewServiceMonitor(f.assets.MustNewAssetReader(ControlPlaneKubeletServiceMonitor))
 }

+func (f *Factory) ControlPlaneKubeletMinimalServiceMonitor() (*monv1.ServiceMonitor, error) {
+	return f.NewServiceMonitor(f.assets.MustNewAssetReader(ControlPlaneKubeletMinimalServiceMonitor))
+}
+
 func (f *Factory) ControlPlaneKubeletServiceMonitorPA() (*monv1.ServiceMonitor, error) {
 	return f.NewServiceMonitor(f.assets.MustNewAssetReader(ControlPlaneKubeletServiceMonitorPA))
 }
@@ -3263,6 +3349,22 @@ func (f *Factory) HashSecret(secret *v1.Secret, data ...string) (*v1.Secret, err
 	}, nil
 }

+func serviceMonitors(appendMinimal bool, fullServiceMonitor, minimalServiceMonitor func() (*monv1.ServiceMonitor, error)) ([]*monv1.ServiceMonitor, error) {
+	sMonitor, err := fullServiceMonitor()
+	if err != nil {
+		return nil, err
+	}
+	sMonitorMinimal, err := minimalServiceMonitor()
+	if err != nil {
+		return nil, err
+	}
+	sms := []*monv1.ServiceMonitor{sMonitor}
+	if appendMinimal {
+		sms = append(sms, sMonitorMinimal)
+	}
+	return sms, nil
+}
+
 func addRemoteWriteConfigs(clusterID string, rw []monv1.RemoteWriteSpec, rwTargets ...RemoteWriteSpec) []monv1.RemoteWriteSpec {
 	clusterIDRelabelConfig := []monv1.RelabelConfig{
 		{
diff --git a/pkg/manifests/manifests_test.go b/pkg/manifests/manifests_test.go
index 492896a8de..3259c995a5 100644
--- a/pkg/manifests/manifests_test.go
+++ b/pkg/manifests/manifests_test.go
@@ -714,7 +714,7 @@ func TestPrometheusOperatorConfiguration(t *testing.T) {
 image: quay.io/test/prometheus-operator
 prometheusConfigReloaderImage: quay.io/test/prometheus-config-reloader
 configReloaderImage: quay.io/test/configmap-reload
-`)
+`, false)

 	c.SetImages(map[string]string{
 		"prometheus-operator": "docker.io/openshift/origin-prometheus-operator:latest",
@@ -804,7 +804,7 @@ func TestPrometheusOperatorAdmissionWebhookConfiguration(t *testing.T) {
 	c, err := NewConfigFromString(`prometheusOperator:
 nodeSelector:
   type: master
-`)
+`, false)

 	c.SetImages(map[string]string{
 		"prometheus-operator-admission-webhook": "docker.io/openshift/origin-prometheus-operator-admission-webhook:latest",
@@ -917,7 +917,7 @@ func TestPrometheusK8sRemoteWriteClusterIDRelabel(t *testing.T) {
 			name: "simple remote write",
 			config: func() *Config {
-				c, err := NewConfigFromString("")
+				c, err := NewConfigFromString("", false)
 				if err != nil {
 					t.Fatal(err)
 				}
@@ -944,7 +944,7 @@ func TestPrometheusK8sRemoteWriteClusterIDRelabel(t *testing.T) {
 			name: "simple remote write with relabel config",
 			config: func() *Config {
-				c, err := NewConfigFromString("")
+				c, err := NewConfigFromString("", false)
NewConfigFromString("", false) if err != nil { t.Fatal(err) } @@ -985,7 +985,7 @@ func TestPrometheusK8sRemoteWriteClusterIDRelabel(t *testing.T) { name: "multiple remote write with relabel config", config: func() *Config { - c, err := NewConfigFromString("") + c, err := NewConfigFromString("", false) if err != nil { t.Fatal(err) } @@ -1247,7 +1247,7 @@ func TestPrometheusK8sRemoteWriteOauth2(t *testing.T) { endpointParams: param1: value1 param2: value2 -`) +`, false) if err != nil { t.Fatal(err) } @@ -1391,7 +1391,7 @@ func TestRemoteWriteAuthorizationConfig(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - c, err := NewConfigFromString(tc.config) + c, err := NewConfigFromString(tc.config, false) if err != nil { t.Fatal(err) } @@ -1450,7 +1450,7 @@ func TestPrometheusK8sConfiguration(t *testing.T) { queryLogFile: /tmp/test ingress: baseAddress: monitoring-demo.staging.core-os.net -`) +`, false) if err != nil { t.Fatal(err) @@ -1706,6 +1706,61 @@ func TestPrometheusQueryLogFileConfig(t *testing.T) { }) } } +func TestPrometheusCollectionProfile(t *testing.T) { + for _, tc := range []struct { + name string + collectionProfile CollectionProfile + expectedLabelSelector *metav1.LabelSelector + }{ + { + name: "full_collection_profile", + collectionProfile: "full", + expectedLabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "monitoring.openshift.io/collection-profile", + Operator: metav1.LabelSelectorOpNotIn, + Values: []string{"minimal"}, + }, + }, + }, + }, + { + name: "minimal_collection_profile", + collectionProfile: "minimal", + expectedLabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "monitoring.openshift.io/collection-profile", + Operator: metav1.LabelSelectorOpNotIn, + Values: []string{"full"}, + }, + }, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + c := NewDefaultConfig() + c.ClusterMonitoringConfiguration.PrometheusK8sConfig.CollectionProfile = tc.collectionProfile + f := NewFactory("openshift-monitoring", "openshift-user-workload-monitoring", c, defaultInfrastructureReader(), &fakeProxyReader{}, NewAssets(assetsPath), &APIServerConfig{}, &configv1.Console{}) + p, err := f.PrometheusK8s( + &v1.Secret{ObjectMeta: metav1.ObjectMeta{Name: "foo"}}, + &v1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "foo"}}, + &v1.Secret{ObjectMeta: metav1.ObjectMeta{Name: "foo"}}, + ) + if err != nil { + t.Fatalf("Unexpected error but got %v", err) + } + + if !reflect.DeepEqual(p.Spec.ServiceMonitorSelector, tc.expectedLabelSelector) { + t.Fatalf("Label selector for service monitor is not configured correctly, got %v, expected %v", p.Spec.ServiceMonitorSelector, tc.expectedLabelSelector) + } + if !reflect.DeepEqual(p.Spec.PodMonitorSelector, tc.expectedLabelSelector) { + t.Fatalf("Label selector for pod monitor is not configured correctly, got %v, expected %v", p.Spec.PodMonitorSelector, tc.expectedLabelSelector) + } + }) + } +} func TestPrometheusRetentionConfigs(t *testing.T) { for _, tc := range []struct { @@ -1786,7 +1841,7 @@ func TestPrometheusK8sConfigurationBodySizeLimit(t *testing.T) { c, err := NewConfigFromString(` prometheusK8s: enforcedBodySizeLimit: "10MB" - `) + `, false) if err != nil { t.Fatal(err) @@ -2000,7 +2055,7 @@ func TestPrometheusK8sAdditionalAlertManagerConfigsSecret(t *testing.T) { for _, tt := range testCases { tt := tt t.Run(tt.name, func(t *testing.T) { - c, err := NewConfigFromString(tt.config) + c, err := NewConfigFromString(tt.config, false) if 
 				t.Fatal(err)
 			}
@@ -2321,7 +2376,7 @@ func TestThanosRulerAdditionalAlertManagerConfigsSecret(t *testing.T) {
 	for _, tt := range testCases {
 		tt := tt
 		t.Run(tt.name, func(t *testing.T) {
-			c, err := NewConfigFromString(tt.config)
+			c, err := NewConfigFromString(tt.config, false)
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -2420,7 +2475,7 @@ k8sPrometheusAdapter:
 	for _, test := range tt {
 		t.Run(test.scenario, func(t *testing.T) {
-			c, err := NewConfigFromString(test.config)
+			c, err := NewConfigFromString(test.config, false)
 			if err != nil {
 				t.Logf("%s\n\n", test.config)
 				t.Fatal(err)
@@ -2476,7 +2531,7 @@ func TestK8sPrometheusAdapterConfiguration(t *testing.T) {
 k8sPrometheusAdapter:
   nodeSelector:
     test: value
-`)
+`, false)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -2537,7 +2592,7 @@ func TestAlertmanagerMainStartupProbe(t *testing.T) {
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
-			c, err := NewConfigFromString(tc.config)
+			c, err := NewConfigFromString(tc.config, false)
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -2607,7 +2662,7 @@ func TestAlertmanagerMainConfiguration(t *testing.T) {
       storage: 10Gi
 ingress:
   baseAddress: monitoring-demo.staging.core-os.net
-`)
+`, false)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -2787,7 +2842,7 @@ func TestAlertManagerUserWorkloadSecretsConfiguration(t *testing.T) {
 }

 func TestNodeExporter(t *testing.T) {
-	c, err := NewConfigFromString(``)
+	c, err := NewConfigFromString(``, false)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -2902,7 +2957,7 @@ nodeExporter:
 	for _, test := range tests {
 		t.Run(test.name, func(st *testing.T) {
-			c, err := NewConfigFromString(test.config)
+			c, err := NewConfigFromString(test.config, false)
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -2941,7 +2996,7 @@ nodeExporter:
 }

 func TestKubeStateMetrics(t *testing.T) {
-	c, err := NewConfigFromString(``)
+	c, err := NewConfigFromString(``, false)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -3000,7 +3055,7 @@ func TestKubeStateMetrics(t *testing.T) {
 }

 func TestOpenShiftStateMetrics(t *testing.T) {
-	c, err := NewConfigFromString(``)
+	c, err := NewConfigFromString(``, false)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -3117,7 +3172,7 @@ func TestThanosQuerierConfiguration(t *testing.T) {
       cpu: 3m
       memory: 4Mi
   logLevel: debug
-  enableRequestLogging: true`)
+  enableRequestLogging: true`, false)

 	if err != nil {
 		t.Fatal(err)
@@ -3270,7 +3325,7 @@ grpc:
 }

 func TestTelemeterConfiguration(t *testing.T) {
-	c, err := NewConfigFromString(``)
+	c, err := NewConfigFromString(``, false)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -3370,7 +3425,7 @@ func TestTelemeterClientSecret(t *testing.T) {
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
-			c, err := NewConfigFromString(tc.config)
+			c, err := NewConfigFromString(tc.config, false)
 			if err != nil {
 				t.Fatal(err)
 			}
@@ -3404,7 +3459,7 @@ func TestTelemeterClientSecret(t *testing.T) {
 }

 func TestThanosRulerConfiguration(t *testing.T) {
-	c, err := NewConfigFromString(``)
+	c, err := NewConfigFromString(``, false)
 	uwc, err := NewUserConfigFromString(`thanosRuler:
   topologySpreadConstraints:
   - maxSkew: 1
@@ -3866,7 +3921,7 @@ func TestPodDisruptionBudget(t *testing.T) {
 func TestPrometheusOperatorUserWorkloadConfiguration(t *testing.T) {
 	c, err := NewConfigFromString(`
 enableUserWorkload: true
-`)
+`, false)

 	c.SetImages(map[string]string{
 		"prometheus-operator": "docker.io/openshift/origin-prometheus-operator:latest",
@@ -3968,7 +4023,7 @@ func TestPrometheusOperatorNodeSelector(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			c, err := NewConfigFromString(`
 enableUserWorkload: true
-`)
+`, false)
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/pkg/manifests/types.go b/pkg/manifests/types.go
index 24aff92647..1edfa59014 100644
--- a/pkg/manifests/types.go
+++ b/pkg/manifests/types.go
@@ -19,6 +19,16 @@ import (
 	v1 "k8s.io/api/core/v1"
 )

+type CollectionProfile string
+type CollectionProfiles []CollectionProfile
+
+const (
+	FullCollectionProfile    = "full"
+	MinimalCollectionProfile = "minimal"
+)
+
+var SupportedCollectionProfiles = CollectionProfiles{FullCollectionProfile, MinimalCollectionProfile}
+
 // The `ClusterMonitoringConfiguration` resource defines settings that
 // customize the default platform monitoring stack through the
 // `cluster-monitoring-config` config map in the `openshift-monitoring`
@@ -186,6 +196,13 @@ type PrometheusK8sConfig struct {
 	Tolerations []v1.Toleration `json:"tolerations,omitempty"`
 	// Defines the pod's topology spread constraints.
 	TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
+	// Defines the metrics collection profile that Prometheus uses to collect
+	// metrics from the platform components. Supported values are `full` or
+	// `minimal`. In the `full` profile (default), Prometheus collects all
+	// metrics that are exposed by the platform components. In the `minimal`
+	// profile, Prometheus only collects metrics necessary for the default
+	// platform alerts, recording rules, telemetry and console dashboards.
+	CollectionProfile CollectionProfile `json:"collectionProfile,omitempty"`
 	// Defines persistent storage for Prometheus. Use this setting to
 	// configure the persistent volume claim, including storage class,
 	// volume size and name.
diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index 921821b843..99b5420d8d 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -838,7 +838,7 @@ func (o *Operator) loadUserWorkloadConfig(ctx context.Context) (*manifests.UserW
 	return uwc, nil
 }

-func (o *Operator) loadConfig(key string) (*manifests.Config, error) {
+func (o *Operator) loadConfig(key string, tp bool) (*manifests.Config, error) {
 	obj, found, err := o.cmapInf.GetStore().GetByKey(key)
 	if err != nil {
 		return nil, errors.Wrap(err, "an error occurred when retrieving the Cluster Monitoring ConfigMap")
@@ -856,7 +856,7 @@ func (o *Operator) loadConfig(key string) (*manifests.Config, error) {
 		return nil, errors.New("the Cluster Monitoring ConfigMap doesn't contain a 'config.yaml' key")
 	}

-	cParsed, err := manifests.NewConfigFromString(configContent)
+	cParsed, err := manifests.NewConfigFromString(configContent, tp)
 	if err != nil {
 		return nil, errors.Wrap(err, "the Cluster Monitoring ConfigMap could not be parsed")
 	}
@@ -865,7 +865,12 @@ func (o *Operator) loadConfig(key string) (*manifests.Config, error) {
 	return nil, err
 }

 func (o *Operator) Config(ctx context.Context, key string) (*manifests.Config, error) {
-	c, err := o.loadConfig(key)
+	tp, err := o.client.TechPreviewEnabled(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	c, err := o.loadConfig(key, tp)
 	if err != nil {
 		return nil, err
 	}
diff --git a/pkg/tasks/controlplane.go b/pkg/tasks/controlplane.go
index 1abb54052d..67caf55113 100644
--- a/pkg/tasks/controlplane.go
+++ b/pkg/tasks/controlplane.go
@@ -48,14 +48,16 @@ func (t *ControlPlaneTask) Run(ctx context.Context) error {
 		return errors.Wrap(err, "reconciling kubernetes mixin rules PrometheusRule failed")
 	}

-	smk, err := t.factory.ControlPlaneKubeletServiceMonitor()
+	sms, err := t.factory.ControlPlaneKubeletServiceMonitors()
 	if err != nil {
-		return errors.Wrap(err, "initializing control-plane kubelet ServiceMonitor failed")
ServiceMonitor failed") + return errors.Wrap(err, "initializing control-plane kubelet ServiceMonitors failed") } - err = t.client.CreateOrUpdateServiceMonitor(ctx, smk) - if err != nil { - return errors.Wrap(err, "reconciling control-plane kubelet ServiceMonitor failed") + for _, sm := range sms { + err = t.client.CreateOrUpdateServiceMonitor(ctx, sm) + if err != nil { + return errors.Wrapf(err, "reconciling %s/%s ServiceMonitor failed", sm.Namespace, sm.Name) + } } smkpa, err := t.factory.ControlPlaneKubeletServiceMonitorPA() @@ -75,16 +77,19 @@ func (t *ControlPlaneTask) Run(ctx context.Context) error { } } - sme, err := t.factory.ControlPlaneEtcdServiceMonitor() + sms, err = t.factory.ControlPlaneEtcdServiceMonitors() if err != nil { - return errors.Wrap(err, "initializing control-plane etcd ServiceMonitor failed") + return errors.Wrap(err, "initializing control-plane etcd ServiceMonitors failed") } if t.config.ClusterMonitoringConfiguration.EtcdConfig.IsEnabled() { - err = t.client.CreateOrUpdateServiceMonitor(ctx, sme) - if err != nil { - return errors.Wrap(err, "reconciling control-plane etcd ServiceMonitor failed") + for _, sm := range sms { + err = t.client.CreateOrUpdateServiceMonitor(ctx, sm) + if err != nil { + return errors.Wrapf(err, "reconciling %s/%s ServiceMonitor failed", sm.Namespace, sm.Name) + } } + etcdCA, err := t.client.WaitForConfigMapByNsName(ctx, types.NamespacedName{Namespace: "openshift-config", Name: "etcd-metric-serving-ca"}) if err != nil { return errors.Wrap(err, "failed to wait for openshift-config/etcd-metric-serving-ca configmap") @@ -105,9 +110,11 @@ func (t *ControlPlaneTask) Run(ctx context.Context) error { return errors.Wrap(err, "reconciling prometheus etcd service monitor secret") } } else { - err = t.client.DeleteServiceMonitor(ctx, sme) - if err != nil { - return errors.Wrap(err, "deleting control-plane etcd ServiceMonitor failed") + for _, sm := range sms { + err = t.client.DeleteServiceMonitor(ctx, sm) + if err != nil { + return errors.Wrapf(err, "deleting %s/%s ServiceMonitor failed", sm.Namespace, sm.Name) + } } } diff --git a/pkg/tasks/kubestatemetrics.go b/pkg/tasks/kubestatemetrics.go index c99ce15c74..dbcff4694f 100644 --- a/pkg/tasks/kubestatemetrics.go +++ b/pkg/tasks/kubestatemetrics.go @@ -104,14 +104,15 @@ func (t *KubeStateMetricsTask) Run(ctx context.Context) error { return errors.Wrap(err, "reconciling kube-state-metrics rules PrometheusRule failed") } - sm, err := t.factory.KubeStateMetricsServiceMonitor() + sms, err := t.factory.KubeStateMetricsServiceMonitors() if err != nil { - return errors.Wrap(err, "initializing kube-state-metrics ServiceMonitor failed") + return errors.Wrap(err, "initializing kube-state-metrics ServiceMonitors failed") } - - err = t.client.CreateOrUpdateServiceMonitor(ctx, sm) - if err != nil { - errors.Wrap(err, "reconciling kube-state-metrics ServiceMonitor failed") + for _, sm := range sms { + err = t.client.CreateOrUpdateServiceMonitor(ctx, sm) + if err != nil { + return errors.Wrapf(err, "reconciling %s/%s ServiceMonitor failed", sm.Namespace, sm.Name) + } } return nil diff --git a/pkg/tasks/nodeexporter.go b/pkg/tasks/nodeexporter.go index f4707a0f76..8680bbfbb2 100644 --- a/pkg/tasks/nodeexporter.go +++ b/pkg/tasks/nodeexporter.go @@ -16,6 +16,7 @@ package tasks import ( "context" + "github.com/openshift/cluster-monitoring-operator/pkg/client" "github.com/openshift/cluster-monitoring-operator/pkg/manifests" "github.com/pkg/errors" @@ -112,11 +113,17 @@ func (t *NodeExporterTask) Run(ctx 
 		return errors.Wrap(err, "reconciling node-exporter rules PrometheusRule failed")
 	}

-	smn, err := t.factory.NodeExporterServiceMonitor()
+	sms, err := t.factory.NodeExporterServiceMonitors()
 	if err != nil {
-		return errors.Wrap(err, "initializing node-exporter ServiceMonitor failed")
+		return errors.Wrap(err, "initializing node-exporter ServiceMonitors failed")
+	}
+
+	for _, sm := range sms {
+		err = t.client.CreateOrUpdateServiceMonitor(ctx, sm)
+		if err != nil {
+			return errors.Wrapf(err, "reconciling %s/%s ServiceMonitor failed", sm.Namespace, sm.Name)
+		}
 	}

-	err = t.client.CreateOrUpdateServiceMonitor(ctx, smn)
-	return errors.Wrap(err, "reconciling node-exporter ServiceMonitor failed")
+	return nil
 }
diff --git a/pkg/tasks/prometheusadapter.go b/pkg/tasks/prometheusadapter.go
index 4cbf6a3ed7..832e4a982d 100644
--- a/pkg/tasks/prometheusadapter.go
+++ b/pkg/tasks/prometheusadapter.go
@@ -234,14 +234,16 @@ func (t *PrometheusAdapterTask) Run(ctx context.Context) error {
 		}
 	}

 	{
-		sm, err := t.factory.PrometheusAdapterServiceMonitor()
+		sms, err := t.factory.PrometheusAdapterServiceMonitors()
 		if err != nil {
-			return errors.Wrap(err, "initializing PrometheusAdapter ServiceMonitor failed")
+			return errors.Wrap(err, "initializing PrometheusAdapter ServiceMonitors failed")
 		}

-		err = t.client.CreateOrUpdateServiceMonitor(ctx, sm)
-		if err != nil {
-			return errors.Wrap(err, "reconciling PrometheusAdapter ServiceMonitor failed")
+		for _, sm := range sms {
+			err = t.client.CreateOrUpdateServiceMonitor(ctx, sm)
+			if err != nil {
+				return errors.Wrapf(err, "reconciling %s/%s ServiceMonitor failed", sm.Namespace, sm.Name)
+			}
 		}
 	}

 	{