Merge pull request #309 from s-urbaniak/bz-1805116

Bug 1805116: jsonnet/telemeter/metrics: add aggregated up metric, remove node_uname_info
openshift · Feb 20, 2020 · 227ae59 · 227ae59
2 parents aa79429 + ce74334
commit 227ae59
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 17 deletions.
diff --git a/docs/data-collection.md b/docs/data-collection.md
@@ -17,11 +17,18 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib
   // All metrics under this prefix must have low (1-5) cardinality and must
   // be well-scoped and follow proper naming and scoping conventions.
   '{__name__=~"cluster:usage:.*"}',
-  // up contains information relevant to the health of the registered
+  // count:up0 contains the count of cluster monitoring sources being marked as down.
+  // This information is relevant to the health of the registered
   // cluster monitoring sources on a cluster. This metric allows telemetry
   // to identify when an update causes a service to begin to crash-loop or
   // flake.
-  '{__name__="up"}',
+  '{__name__="count:up0"}',
+  // count:up1 contains the count of cluster monitoring sources being marked as up.
+  // This information is relevant to the health of the registered
+  // cluster monitoring sources on a cluster. This metric allows telemetry
+  // to identify when an update causes a service to begin to crash-loop or
+  // flake.
+  '{__name__="count:up1"}',
   // cluster_version reports what payload and version the cluster is being
   // configured to and is used to identify what versions are on a cluster
   // that is experiencing problems.
@@ -52,8 +59,6 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib
   // cluster_feature_set reports the configured cluster feature set and
   // whether the feature set is considered supported or unsupported.
   '{__name__="cluster_feature_set"}',
-  // node_uname_info reports information about OS gathered from the uname syscall
-  '{__name__="node_uname_info"}',
   // instance:etcd_object_counts:sum identifies two key metrics:
   // - the rough size of the data stored in etcd and
   // - the consistency between the etcd instances.
@@ -151,10 +156,10 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib
   '{__name__="noobaa_total_usage"}',
   // console_url is the url of the console running on the cluster.
   '{__name__="console_url"}',
-  // cluster:network_attachment_definition_instances:max" gives max no of instance 
+  // cluster:network_attachment_definition_instances:max gives the maximum number of instances
   // in the cluster that are annotated with k8s.v1.cni.cncf.io/networks, labelled by networks.
   '{__name__="cluster:network_attachment_definition_instances:max"}',
-  // cluster:network_attachment_definition_enabled_instance_up  informs (1 or 0) if the cluster has 
+  // cluster:network_attachment_definition_enabled_instance_up:max reports (1 or 0) whether the cluster has
   // at least one instance with the k8s.v1.cni.cncf.io/networks annotation, labelled by networks (any or sriov).
   '{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}',
 ]

diff --git a/docs/sample-metrics.md b/docs/sample-metrics.md
@@ -13,7 +13,7 @@ return the full set of metrics that the Telemeter client captures:
 
 [embedmd]:# (telemeter_query txt)
 ```txt
-{__name__=~"cluster:usage:.*"} or {__name__="up"} or {__name__="cluster_version"} or {__name__="cluster_version_available_updates"} or {__name__="cluster_operator_up"} or {__name__="cluster_operator_conditions"} or {__name__="cluster_version_payload"} or {__name__="cluster_installer"} or {__name__="cluster_infrastructure_provider"} or {__name__="cluster_feature_set"} or {__name__="node_uname_info"} or {__name__="instance:etcd_object_counts:sum"} or {__name__="ALERTS",alertstate="firing"} or {__name__="code:apiserver_request_count:rate:sum"} or {__name__="cluster:capacity_cpu_cores:sum"} or {__name__="cluster:capacity_memory_bytes:sum"} or {__name__="cluster:cpu_usage_cores:sum"} or {__name__="cluster:memory_usage_bytes:sum"} or {__name__="openshift:cpu_usage_cores:sum"} or {__name__="openshift:memory_usage_bytes:sum"} or {__name__="workload:cpu_usage_cores:sum"} or {__name__="workload:memory_usage_bytes:sum"} or {__name__="cluster:virt_platform_nodes:sum"} or {__name__="cluster:node_instance_type_count:sum"} or {__name__="cnv:vmi_status_running:count"} or {__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} or {__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} or {__name__="subscription_sync_total"} or {__name__="csv_succeeded"} or {__name__="csv_abnormal"} or {__name__="ceph_cluster_total_bytes"} or {__name__="ceph_cluster_total_used_raw_bytes"} or {__name__="ceph_health_status"} or {__name__="job:ceph_osd_metadata:count"} or {__name__="job:kube_pv:count"} or {__name__="job:ceph_pools_iops:total"} or {__name__="job:ceph_pools_iops_bytes:total"} or {__name__="job:ceph_versions_running:count"} or {__name__="job:noobaa_total_unhealthy_buckets:sum"} or {__name__="job:noobaa_bucket_count:sum"} or {__name__="job:noobaa_total_object_count:sum"} or {__name__="noobaa_accounts_num"} or {__name__="noobaa_total_usage"} or {__name__="console_url"} or {__name__="cluster:network_attachment_definition_instances:max"} or 
{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}
+{__name__=~"cluster:usage:.*"} or {__name__="count:up0"} or {__name__="count:up1"} or {__name__="cluster_version"} or {__name__="cluster_version_available_updates"} or {__name__="cluster_operator_up"} or {__name__="cluster_operator_conditions"} or {__name__="cluster_version_payload"} or {__name__="cluster_installer"} or {__name__="cluster_infrastructure_provider"} or {__name__="cluster_feature_set"} or {__name__="instance:etcd_object_counts:sum"} or {__name__="ALERTS",alertstate="firing"} or {__name__="code:apiserver_request_count:rate:sum"} or {__name__="cluster:capacity_cpu_cores:sum"} or {__name__="cluster:capacity_memory_bytes:sum"} or {__name__="cluster:cpu_usage_cores:sum"} or {__name__="cluster:memory_usage_bytes:sum"} or {__name__="openshift:cpu_usage_cores:sum"} or {__name__="openshift:memory_usage_bytes:sum"} or {__name__="workload:cpu_usage_cores:sum"} or {__name__="workload:memory_usage_bytes:sum"} or {__name__="cluster:virt_platform_nodes:sum"} or {__name__="cluster:node_instance_type_count:sum"} or {__name__="cnv:vmi_status_running:count"} or {__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} or {__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} or {__name__="subscription_sync_total"} or {__name__="csv_succeeded"} or {__name__="csv_abnormal"} or {__name__="ceph_cluster_total_bytes"} or {__name__="ceph_cluster_total_used_raw_bytes"} or {__name__="ceph_health_status"} or {__name__="job:ceph_osd_metadata:count"} or {__name__="job:kube_pv:count"} or {__name__="job:ceph_pools_iops:total"} or {__name__="job:ceph_pools_iops_bytes:total"} or {__name__="job:ceph_versions_running:count"} or {__name__="job:noobaa_total_unhealthy_buckets:sum"} or {__name__="job:noobaa_bucket_count:sum"} or {__name__="job:noobaa_total_object_count:sum"} or {__name__="noobaa_accounts_num"} or {__name__="noobaa_total_usage"} or {__name__="console_url"} or {__name__="cluster:network_attachment_definition_instances:max"} or 
{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}
 ```
 
 For reference, here is an example response produced by a running OpenShift cluster:

diff --git a/jsonnet/telemeter/metrics.jsonnet b/jsonnet/telemeter/metrics.jsonnet
@@ -9,11 +9,18 @@
   // All metrics under this prefix must have low (1-5) cardinality and must
   // be well-scoped and follow proper naming and scoping conventions.
   '{__name__=~"cluster:usage:.*"}',
-  // up contains information relevant to the health of the registered
+  // count:up0 contains the count of cluster monitoring sources being marked as down.
+  // This information is relevant to the health of the registered
   // cluster monitoring sources on a cluster. This metric allows telemetry
   // to identify when an update causes a service to begin to crash-loop or
   // flake.
-  '{__name__="up"}',
+  '{__name__="count:up0"}',
+  // count:up1 contains the count of cluster monitoring sources being marked as up.
+  // This information is relevant to the health of the registered
+  // cluster monitoring sources on a cluster. This metric allows telemetry
+  // to identify when an update causes a service to begin to crash-loop or
+  // flake.
+  '{__name__="count:up1"}',
   // cluster_version reports what payload and version the cluster is being
   // configured to and is used to identify what versions are on a cluster
   // that is experiencing problems.
@@ -44,8 +51,6 @@
   // cluster_feature_set reports the configured cluster feature set and
   // whether the feature set is considered supported or unsupported.
   '{__name__="cluster_feature_set"}',
-  // node_uname_info reports information about OS gathered from the uname syscall
-  '{__name__="node_uname_info"}',
   // instance:etcd_object_counts:sum identifies two key metrics:
   // - the rough size of the data stored in etcd and
   // - the consistency between the etcd instances.
@@ -143,10 +148,10 @@
   '{__name__="noobaa_total_usage"}',
   // console_url is the url of the console running on the cluster.
   '{__name__="console_url"}',
-  // cluster:network_attachment_definition_instances:max" gives max no of instance 
+  // cluster:network_attachment_definition_instances:max gives the maximum number of instances
   // in the cluster that are annotated with k8s.v1.cni.cncf.io/networks, labelled by networks.
   '{__name__="cluster:network_attachment_definition_instances:max"}',
-  // cluster:network_attachment_definition_enabled_instance_up  informs (1 or 0) if the cluster has 
+  // cluster:network_attachment_definition_enabled_instance_up:max reports (1 or 0) whether the cluster has
   // at least one instance with the k8s.v1.cni.cncf.io/networks annotation, labelled by networks (any or sriov).
   '{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}',
 ]
diff --git a/manifests/benchmark/statefulSetTelemeterServer.yaml b/manifests/benchmark/statefulSetTelemeterServer.yaml
@@ -26,7 +26,8 @@ spec:
         - --shared-key=/etc/pki/service/tls.key
         - --authorize=http://localhost:8083
         - --whitelist={__name__=~"cluster:usage:.*"}
-        - --whitelist={__name__="up"}
+        - --whitelist={__name__="count:up0"}
+        - --whitelist={__name__="count:up1"}
         - --whitelist={__name__="cluster_version"}
         - --whitelist={__name__="cluster_version_available_updates"}
         - --whitelist={__name__="cluster_operator_up"}
@@ -35,7 +36,6 @@ spec:
         - --whitelist={__name__="cluster_installer"}
         - --whitelist={__name__="cluster_infrastructure_provider"}
         - --whitelist={__name__="cluster_feature_set"}
-        - --whitelist={__name__="node_uname_info"}
         - --whitelist={__name__="instance:etcd_object_counts:sum"}
         - --whitelist={__name__="alerts",alertstate="firing"}
         - --whitelist={__name__="code:apiserver_request_count:rate:sum"}

diff --git a/manifests/client/deployment.yaml b/manifests/client/deployment.yaml
@@ -28,7 +28,8 @@ spec:
         - --anonymize-salt-file=/etc/telemeter/salt
         - --anonymize-labels=$(ANONYMIZE_LABELS)
         - --match={__name__=~"cluster:usage:.*"}
-        - --match={__name__="up"}
+        - --match={__name__="count:up0"}
+        - --match={__name__="count:up1"}
         - --match={__name__="cluster_version"}
         - --match={__name__="cluster_version_available_updates"}
         - --match={__name__="cluster_operator_up"}
@@ -37,7 +38,6 @@ spec:
         - --match={__name__="cluster_installer"}
         - --match={__name__="cluster_infrastructure_provider"}
         - --match={__name__="cluster_feature_set"}
-        - --match={__name__="node_uname_info"}
         - --match={__name__="instance:etcd_object_counts:sum"}
         - --match={__name__="ALERTS",alertstate="firing"}
         - --match={__name__="code:apiserver_request_count:rate:sum"}