Skip to content

Commit

Permalink
Merge pull request #309 from s-urbaniak/bz-1805116
Browse files Browse the repository at this point in the history
Bug 1805116: jsonnet/telemeter/metrics: add aggregated up metric, remove node_uname_info
  • Loading branch information
openshift-merge-robot committed Feb 20, 2020
2 parents aa79429 + ce74334 commit 227ae59
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 17 deletions.
17 changes: 11 additions & 6 deletions docs/data-collection.md
Expand Up @@ -17,11 +17,18 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib
// All metrics under this prefix must have low (1-5) cardinality and must
// be well-scoped and follow proper naming and scoping conventions.
'{__name__=~"cluster:usage:.*"}',
// up contains information relevant to the health of the registered
// count:up0 contains the count of cluster monitoring sources being marked as down.
// This information is relevant to the health of the registered
// cluster monitoring sources on a cluster. This metric allows telemetry
// to identify when an update causes a service to begin to crash-loop or
// flake.
'{__name__="up"}',
'{__name__="count:up0"}',
// count:up1 contains the count of cluster monitoring sources being marked as up.
// This information is relevant to the health of the registered
// cluster monitoring sources on a cluster. This metric allows telemetry
// to identify when an update causes a service to begin to crash-loop or
// flake.
'{__name__="count:up1"}',
// cluster_version reports what payload and version the cluster is being
// configured to and is used to identify what versions are on a cluster
// that is experiencing problems.
Expand Down Expand Up @@ -52,8 +59,6 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib
// cluster_feature_set reports the configured cluster feature set and
// whether the feature set is considered supported or unsupported.
'{__name__="cluster_feature_set"}',
// node_uname_info reports information about OS gathered from the uname syscall
'{__name__="node_uname_info"}',
// instance:etcd_object_counts:sum identifies two key metrics:
// - the rough size of the data stored in etcd and
// - the consistency between the etcd instances.
Expand Down Expand Up @@ -151,10 +156,10 @@ For the OpenShift 4 Developer Preview we will be sending back these exact attrib
'{__name__="noobaa_total_usage"}',
// console_url is the url of the console running on the cluster.
'{__name__="console_url"}',
// cluster:network_attachment_definition_instances:max gives the maximum number of instances
// cluster:network_attachment_definition_instances:max gives the maximum number of instances
// in the cluster that are annotated with k8s.v1.cni.cncf.io/networks, labelled by networks.
'{__name__="cluster:network_attachment_definition_instances:max"}',
// cluster:network_attachment_definition_enabled_instance_up:max reports (1 or 0) whether the cluster has
// cluster:network_attachment_definition_enabled_instance_up:max reports (1 or 0) whether the cluster has
// at least one instance with the k8s.v1.cni.cncf.io/networks annotation, labelled by networks (any or sriov).
'{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}',
]
Expand Down
2 changes: 1 addition & 1 deletion docs/sample-metrics.md
Expand Up @@ -13,7 +13,7 @@ return the full set of metrics that the Telemeter client captures:

[embedmd]:# (telemeter_query txt)
```txt
{__name__=~"cluster:usage:.*"} or {__name__="up"} or {__name__="cluster_version"} or {__name__="cluster_version_available_updates"} or {__name__="cluster_operator_up"} or {__name__="cluster_operator_conditions"} or {__name__="cluster_version_payload"} or {__name__="cluster_installer"} or {__name__="cluster_infrastructure_provider"} or {__name__="cluster_feature_set"} or {__name__="node_uname_info"} or {__name__="instance:etcd_object_counts:sum"} or {__name__="ALERTS",alertstate="firing"} or {__name__="code:apiserver_request_count:rate:sum"} or {__name__="cluster:capacity_cpu_cores:sum"} or {__name__="cluster:capacity_memory_bytes:sum"} or {__name__="cluster:cpu_usage_cores:sum"} or {__name__="cluster:memory_usage_bytes:sum"} or {__name__="openshift:cpu_usage_cores:sum"} or {__name__="openshift:memory_usage_bytes:sum"} or {__name__="workload:cpu_usage_cores:sum"} or {__name__="workload:memory_usage_bytes:sum"} or {__name__="cluster:virt_platform_nodes:sum"} or {__name__="cluster:node_instance_type_count:sum"} or {__name__="cnv:vmi_status_running:count"} or {__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} or {__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} or {__name__="subscription_sync_total"} or {__name__="csv_succeeded"} or {__name__="csv_abnormal"} or {__name__="ceph_cluster_total_bytes"} or {__name__="ceph_cluster_total_used_raw_bytes"} or {__name__="ceph_health_status"} or {__name__="job:ceph_osd_metadata:count"} or {__name__="job:kube_pv:count"} or {__name__="job:ceph_pools_iops:total"} or {__name__="job:ceph_pools_iops_bytes:total"} or {__name__="job:ceph_versions_running:count"} or {__name__="job:noobaa_total_unhealthy_buckets:sum"} or {__name__="job:noobaa_bucket_count:sum"} or {__name__="job:noobaa_total_object_count:sum"} or {__name__="noobaa_accounts_num"} or {__name__="noobaa_total_usage"} or {__name__="console_url"} or {__name__="cluster:network_attachment_definition_instances:max"} or 
{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}
{__name__=~"cluster:usage:.*"} or {__name__="count:up0"} or {__name__="count:up1"} or {__name__="cluster_version"} or {__name__="cluster_version_available_updates"} or {__name__="cluster_operator_up"} or {__name__="cluster_operator_conditions"} or {__name__="cluster_version_payload"} or {__name__="cluster_installer"} or {__name__="cluster_infrastructure_provider"} or {__name__="cluster_feature_set"} or {__name__="instance:etcd_object_counts:sum"} or {__name__="ALERTS",alertstate="firing"} or {__name__="code:apiserver_request_count:rate:sum"} or {__name__="cluster:capacity_cpu_cores:sum"} or {__name__="cluster:capacity_memory_bytes:sum"} or {__name__="cluster:cpu_usage_cores:sum"} or {__name__="cluster:memory_usage_bytes:sum"} or {__name__="openshift:cpu_usage_cores:sum"} or {__name__="openshift:memory_usage_bytes:sum"} or {__name__="workload:cpu_usage_cores:sum"} or {__name__="workload:memory_usage_bytes:sum"} or {__name__="cluster:virt_platform_nodes:sum"} or {__name__="cluster:node_instance_type_count:sum"} or {__name__="cnv:vmi_status_running:count"} or {__name__="node_role_os_version_machine:cpu_capacity_cores:sum"} or {__name__="node_role_os_version_machine:cpu_capacity_sockets:sum"} or {__name__="subscription_sync_total"} or {__name__="csv_succeeded"} or {__name__="csv_abnormal"} or {__name__="ceph_cluster_total_bytes"} or {__name__="ceph_cluster_total_used_raw_bytes"} or {__name__="ceph_health_status"} or {__name__="job:ceph_osd_metadata:count"} or {__name__="job:kube_pv:count"} or {__name__="job:ceph_pools_iops:total"} or {__name__="job:ceph_pools_iops_bytes:total"} or {__name__="job:ceph_versions_running:count"} or {__name__="job:noobaa_total_unhealthy_buckets:sum"} or {__name__="job:noobaa_bucket_count:sum"} or {__name__="job:noobaa_total_object_count:sum"} or {__name__="noobaa_accounts_num"} or {__name__="noobaa_total_usage"} or {__name__="console_url"} or {__name__="cluster:network_attachment_definition_instances:max"} or 
{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}
```

For reference, here is an example response produced by a running OpenShift cluster:
Expand Down
17 changes: 11 additions & 6 deletions jsonnet/telemeter/metrics.jsonnet
Expand Up @@ -9,11 +9,18 @@
// All metrics under this prefix must have low (1-5) cardinality and must
// be well-scoped and follow proper naming and scoping conventions.
'{__name__=~"cluster:usage:.*"}',
// up contains information relevant to the health of the registered
// count:up0 contains the count of cluster monitoring sources being marked as down.
// This information is relevant to the health of the registered
// cluster monitoring sources on a cluster. This metric allows telemetry
// to identify when an update causes a service to begin to crash-loop or
// flake.
'{__name__="up"}',
'{__name__="count:up0"}',
// count:up1 contains the count of cluster monitoring sources being marked as up.
// This information is relevant to the health of the registered
// cluster monitoring sources on a cluster. This metric allows telemetry
// to identify when an update causes a service to begin to crash-loop or
// flake.
'{__name__="count:up1"}',
// cluster_version reports what payload and version the cluster is being
// configured to and is used to identify what versions are on a cluster
// that is experiencing problems.
Expand Down Expand Up @@ -44,8 +51,6 @@
// cluster_feature_set reports the configured cluster feature set and
// whether the feature set is considered supported or unsupported.
'{__name__="cluster_feature_set"}',
// node_uname_info reports information about OS gathered from the uname syscall
'{__name__="node_uname_info"}',
// instance:etcd_object_counts:sum identifies two key metrics:
// - the rough size of the data stored in etcd and
// - the consistency between the etcd instances.
Expand Down Expand Up @@ -143,10 +148,10 @@
'{__name__="noobaa_total_usage"}',
// console_url is the url of the console running on the cluster.
'{__name__="console_url"}',
// cluster:network_attachment_definition_instances:max gives the maximum number of instances
// cluster:network_attachment_definition_instances:max gives the maximum number of instances
// in the cluster that are annotated with k8s.v1.cni.cncf.io/networks, labelled by networks.
'{__name__="cluster:network_attachment_definition_instances:max"}',
// cluster:network_attachment_definition_enabled_instance_up:max reports (1 or 0) whether the cluster has
// cluster:network_attachment_definition_enabled_instance_up:max reports (1 or 0) whether the cluster has
// at least one instance with the k8s.v1.cni.cncf.io/networks annotation, labelled by networks (any or sriov).
'{__name__="cluster:network_attachment_definition_enabled_instance_up:max"}',
]
4 changes: 2 additions & 2 deletions manifests/benchmark/statefulSetTelemeterServer.yaml
Expand Up @@ -26,7 +26,8 @@ spec:
- --shared-key=/etc/pki/service/tls.key
- --authorize=http://localhost:8083
- --whitelist={__name__=~"cluster:usage:.*"}
- --whitelist={__name__="up"}
- --whitelist={__name__="count:up0"}
- --whitelist={__name__="count:up1"}
- --whitelist={__name__="cluster_version"}
- --whitelist={__name__="cluster_version_available_updates"}
- --whitelist={__name__="cluster_operator_up"}
Expand All @@ -35,7 +36,6 @@ spec:
- --whitelist={__name__="cluster_installer"}
- --whitelist={__name__="cluster_infrastructure_provider"}
- --whitelist={__name__="cluster_feature_set"}
- --whitelist={__name__="node_uname_info"}
- --whitelist={__name__="instance:etcd_object_counts:sum"}
- --whitelist={__name__="alerts",alertstate="firing"}
- --whitelist={__name__="code:apiserver_request_count:rate:sum"}
Expand Down
4 changes: 2 additions & 2 deletions manifests/client/deployment.yaml
Expand Up @@ -28,7 +28,8 @@ spec:
- --anonymize-salt-file=/etc/telemeter/salt
- --anonymize-labels=$(ANONYMIZE_LABELS)
- --match={__name__=~"cluster:usage:.*"}
- --match={__name__="up"}
- --match={__name__="count:up0"}
- --match={__name__="count:up1"}
- --match={__name__="cluster_version"}
- --match={__name__="cluster_version_available_updates"}
- --match={__name__="cluster_operator_up"}
Expand All @@ -37,7 +38,6 @@ spec:
- --match={__name__="cluster_installer"}
- --match={__name__="cluster_infrastructure_provider"}
- --match={__name__="cluster_feature_set"}
- --match={__name__="node_uname_info"}
- --match={__name__="instance:etcd_object_counts:sum"}
- --match={__name__="ALERTS",alertstate="firing"}
- --match={__name__="code:apiserver_request_count:rate:sum"}
Expand Down

0 comments on commit 227ae59

Please sign in to comment.