Skip to content

Commit

Permalink
remove unused rules
Browse files Browse the repository at this point in the history
  • Loading branch information
raptorsun committed Aug 10, 2021
1 parent c188b47 commit 1f6a0ba
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 170 deletions.
128 changes: 0 additions & 128 deletions assets/cluster-monitoring-operator/prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,56 +63,6 @@ spec:
severity: warning
- name: openshift-kubernetes.rules
rules:
- expr: sum(container_memory_usage_bytes{container="",pod!=""}) BY (pod, namespace)
record: pod:container_memory_usage_bytes:sum
- expr: sum(container_spec_cpu_shares{container="",pod!=""}) BY (pod, namespace)
record: pod:container_spec_cpu_shares:sum
- expr: sum(rate(container_cpu_usage_seconds_total{container="",pod!=""}[5m]))
BY (pod, namespace)
record: pod:container_cpu_usage:sum
- expr: sum(container_fs_usage_bytes{pod!=""}) BY (pod, namespace)
record: pod:container_fs_usage_bytes:sum
- expr: sum(container_memory_usage_bytes{container!=""}) BY (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: sum(container_spec_cpu_shares{container!=""}) BY (namespace)
record: namespace:container_spec_cpu_shares:sum
- expr: sum(rate(container_cpu_usage_seconds_total{container!="POD",container!=""}[5m]))
BY (namespace)
record: namespace:container_cpu_usage:sum
- expr: sum(container_memory_usage_bytes{container="",pod!=""}) BY (cluster) /
sum(machine_memory_bytes) BY (cluster)
record: cluster:memory_usage:ratio
- expr: sum(container_spec_cpu_shares{container="",pod!=""}) / 1000 / sum(machine_cpu_cores)
record: cluster:container_spec_cpu_shares:ratio
- expr: sum(rate(container_cpu_usage_seconds_total{container="",pod!=""}[5m]))
/ sum(machine_cpu_cores)
record: cluster:container_cpu_usage:ratio
- expr: max without(endpoint, instance, job, pod, service) (kube_node_labels and
on(node) kube_node_role{role="master"})
labels:
label_node_role_kubernetes_io: master
label_node_role_kubernetes_io_master: "true"
record: cluster:master_nodes
- expr: max without(endpoint, instance, job, pod, service) (kube_node_labels and
on(node) kube_node_role{role="infra"})
labels:
label_node_role_kubernetes_io_infra: "true"
record: cluster:infra_nodes
- expr: max without(endpoint, instance, job, pod, service) (cluster:master_nodes
and on(node) cluster:infra_nodes)
labels:
label_node_role_kubernetes_io_infra: "true"
label_node_role_kubernetes_io_master: "true"
record: cluster:master_infra_nodes
- expr: cluster:master_infra_nodes or on (node) cluster:master_nodes or on (node)
cluster:infra_nodes or on (node) max without(endpoint, instance, job, pod,
service) (kube_node_labels)
record: cluster:nodes_roles
- expr: kube_node_labels and on(node) (sum(label_replace(node_cpu_info, "node",
"$1", "instance", "(.*)")) by (node, package, core) == 2)
labels:
label_node_hyperthread_enabled: "true"
record: cluster:hyperthread_enabled_nodes
- expr: count(sum(virt_platform) by (instance, type, system_manufacturer, system_product_name,
baseboard_manufacturer, baseboard_product_name)) by (type, system_manufacturer,
system_product_name, baseboard_manufacturer, baseboard_product_name)
Expand All @@ -137,44 +87,12 @@ spec:
)
)
record: cluster:capacity_cpu_cores:sum
- expr: |
clamp_max(
label_replace(
sum by(instance, package, core) (
node_cpu_info{core!="",package!=""}
or
# Assume core = cpu and package = 0 for platforms that don't expose core/package labels.
label_replace(label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "")
) > 1,
"label_node_hyperthread_enabled",
"true",
"instance",
"(.*)"
) or on (instance, package)
label_replace(
sum by(instance, package, core) (
label_replace(node_cpu_info{core!="",package!=""}
or
# Assume core = cpu and package = 0 for platforms that don't expose core/package labels.
label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "")
) <= 1,
"label_node_hyperthread_enabled",
"false",
"instance",
"(.*)"
),
1
)
record: cluster:cpu_core_hyperthreading
- expr: |
topk by(node) (1, cluster:nodes_roles) * on (node)
group_right( label_beta_kubernetes_io_instance_type, label_node_role_kubernetes_io, label_node_openshift_io_os_id, label_kubernetes_io_arch,
label_node_role_kubernetes_io_master, label_node_role_kubernetes_io_infra)
label_replace( cluster:cpu_core_hyperthreading, "node", "$1", "instance", "(.*)" )
record: cluster:cpu_core_node_labels
- expr: count(cluster:cpu_core_node_labels) by (label_beta_kubernetes_io_instance_type,
label_node_hyperthread_enabled)
record: cluster:capacity_cpu_cores_hyperthread_enabled:sum
- expr: |
sum by(label_beta_kubernetes_io_instance_type, label_node_role_kubernetes_io)
(
Expand Down Expand Up @@ -224,15 +142,6 @@ spec:
)
)
record: cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum
- expr: (sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_node_role_kubernetes_io_master="",label_node_role_kubernetes_io_infra=""}
or absent(__does_not_exist__)*0)) + ((sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_node_role_kubernetes_io_master="true"}
or absent(__does_not_exist__)*0) * ((max(cluster_master_schedulable == 1)*0+1)
or (absent(cluster_master_schedulable == 1)*0))))
record: workload:capacity_physical_cpu_cores:sum
- expr: min_over_time(workload:capacity_physical_cpu_cores:sum[5m:15s])
record: cluster:usage:workload:capacity_physical_cpu_cores:min:5m
- expr: max_over_time(workload:capacity_physical_cpu_cores:sum[5m:15s])
record: cluster:usage:workload:capacity_physical_cpu_cores:max:5m
- expr: |
sum by (provisioner) (
topk by (namespace, persistentvolumeclaim) (
Expand All @@ -245,20 +154,9 @@ spec:
record: cluster:kubelet_volume_stats_used_bytes:provisioner:sum
- expr: sum(etcd_object_counts) BY (instance)
record: instance:etcd_object_counts:sum
- expr: topk(500, max(etcd_object_counts) by (resource))
record: cluster:usage:resources:sum
- expr: count(count (kube_pod_restart_policy{type!="Always",namespace!~"openshift-.+"})
by (namespace,pod))
record: cluster:usage:pods:terminal:workload:sum
- expr: sum(max(kubelet_containers_per_pod_count_sum) by (instance))
record: cluster:usage:containers:sum
- expr: count(cluster:cpu_core_node_labels) by (label_kubernetes_io_arch, label_node_hyperthread_enabled,
label_node_openshift_io_os_id,label_node_role_kubernetes_io_master,label_node_role_kubernetes_io_infra)
record: node_role_os_version_machine:cpu_capacity_cores:sum
- expr: count(max(cluster:cpu_core_node_labels) by (node, package, label_beta_kubernetes_io_instance_type,
label_node_hyperthread_enabled, label_node_role_kubernetes_io) ) by ( label_beta_kubernetes_io_instance_type,
label_node_hyperthread_enabled, label_node_role_kubernetes_io)
record: cluster:capacity_cpu_sockets_hyperthread_enabled:sum
- expr: count (max(cluster:cpu_core_node_labels) by (node, package, label_kubernetes_io_arch,
label_node_hyperthread_enabled, label_node_openshift_io_os_id,label_node_role_kubernetes_io_master,label_node_role_kubernetes_io_infra)
) by (label_kubernetes_io_arch, label_node_hyperthread_enabled, label_node_openshift_io_os_id,label_node_role_kubernetes_io_master,label_node_role_kubernetes_io_infra)
Expand All @@ -273,13 +171,6 @@ spec:
record: cluster:vsphere_esxi_version_total:sum
- expr: sum by(hw_version)(vsphere_node_hw_version_total)
record: cluster:vsphere_node_hw_version_total:sum
- expr: |
sum(
min by (node) (kube_node_status_condition{condition="Ready",status="true"})
and
max by (node) (kube_node_role{role="master"})
) == bool sum(kube_node_role{role="master"})
record: cluster:control_plane:all_nodes_ready
- alert: ClusterMonitoringOperatorReconciliationErrors
annotations:
message: Cluster Monitoring Operator is experiencing unexpected reconciliation
Expand Down Expand Up @@ -334,23 +225,6 @@ spec:
for: 15m
labels:
severity: info
- expr: avg_over_time((((count((max by (node) (up{job="kubelet",metrics_path="/metrics"}
== 1) and max by (node) (kube_node_status_condition{condition="Ready",status="true"}
== 1) and min by (node) (kube_node_spec_unschedulable == 0))) / scalar(count(min
by (node) (kube_node_spec_unschedulable == 0))))))[5m:1s])
record: cluster:usage:kube_schedulable_node_ready_reachable:avg5m
- expr: avg_over_time((count(max by (node) (kube_node_status_condition{condition="Ready",status="true"}
== 1)) / scalar(count(max by (node) (kube_node_status_condition{condition="Ready",status="true"}))))[5m:1s])
record: cluster:usage:kube_node_ready:avg5m
- expr: (max without (condition,container,endpoint,instance,job,service) (((kube_pod_status_ready{condition="false"}
== 1)*0 or (kube_pod_status_ready{condition="true"} == 1)) * on(pod,namespace)
group_left() group by (pod,namespace) (kube_pod_status_phase{phase=~"Running|Unknown|Pending"}
== 1)))
record: kube_running_pod_ready
- expr: avg(kube_running_pod_ready{namespace=~"openshift-.*"})
record: cluster:usage:openshift:kube_running_pod_ready:avg
- expr: avg(kube_running_pod_ready{namespace!~"openshift-.*"})
record: cluster:usage:workload:kube_running_pod_ready:avg
- interval: 30s
name: kubernetes-recurring.rules
rules:
Expand Down Expand Up @@ -384,8 +258,6 @@ spec:
record: cluster:usage:openshift:ingress_request_total:irate5m
- name: openshift-build.rules
rules:
- expr: sum(openshift_build_total{job="kubernetes-apiservers",phase="Error"})/(sum(openshift_build_total{job="kubernetes-apiservers",phase=~"Failed|Complete|Error"}))
record: build_error_rate
- expr: sum by (strategy) (openshift_build_status_phase_total)
record: openshift:build_by_strategy:sum
- name: openshift-monitoring.rules
Expand Down
42 changes: 0 additions & 42 deletions assets/control-plane/prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -525,35 +525,6 @@ spec:
for: 15m
labels:
severity: critical
- name: kube-apiserver-histogram.rules
rules:
- expr: |
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
labels:
quantile: "0.99"
verb: read
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
labels:
quantile: "0.99"
verb: write
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- name: k8s.rules
rules:
- expr: |
Expand Down Expand Up @@ -741,19 +712,6 @@ spec:
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules
rules:
- expr: |
topk by(namespace, pod) (1,
max by (node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (cluster, node) (sum by (node, cpu) (
node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node)
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
))
record: node:node_num_cpu:sum
- expr: |
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
Expand Down
89 changes: 89 additions & 0 deletions jsonnet/patch-rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ local excludedRuleGroups = [
'kube-apiserver-slos',
'kube-apiserver.rules',
'kube-apiserver-burnrate.rules',
'kube-apiserver-histogram.rules',
];

local excludedRules = [
Expand All @@ -17,6 +18,12 @@ local excludedRules = [
{ alert: 'TargetDown' },
],
},
{
  // kubelet.rules: drop the upstream PLEG relist-latency quantile rule.
  // The name is colon-separated, i.e. a *recording* rule, so it must be
  // matched through the 'record' field — under 'alert' the exclusion would
  // never match and the rule would still be generated.
  name: 'kubelet.rules',
  rules: [
    { record: 'node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile' },
  ],
},
{
name: 'kubernetes-system',
rules: [
Expand Down Expand Up @@ -59,6 +66,88 @@ local excludedRules = [
{ alert: 'KubeDeploymentReplicasMismatch' },
],
},
{
  // kube-prometheus-node-recording.rules: drop unused per-instance node
  // recording rules. All four names are recording rules (colon-separated
  // names; alert names are CamelCase), so they must be matched through the
  // 'record' field — under 'alert' the exclusions would never match.
  name: 'kube-prometheus-node-recording.rules',
  rules: [
    { record: 'instance:node_cpu:rate:sum' },
    { record: 'instance:node_network_receive_bytes:rate:sum' },
    { record: 'instance:node_cpu:ratio' },
    { record: 'instance:node_network_transmit_bytes:rate:sum' },
  ],
},
{
  // node.rules: drop unused node recording rules.
  // 'node_namespace_pod:kube_pod_info:' was listed twice; a single entry is
  // sufficient for the exclusion match, so the duplicate is removed.
  name: 'node.rules',
  rules: [
    { record: 'node:node_num_cpu:sum' },
    { record: 'node_namespace_pod:kube_pod_info:' },
  ],
},
{
// openshift-build.rules: drop the unused build error-rate recording rule
// (removed from assets/cluster-monitoring-operator/prometheus-rule.yaml in
// the same change).
name: 'openshift-build.rules',
rules: [
{ record: 'build_error_rate' },
],
},
{
// openshift-kubernetes.rules: recording rules removed from the shipped
// assets in this change, listed here so they are excluded from generation.
// Names are kept sorted alphabetically; all entries are recording rules and
// are matched through their 'record' field.
// NOTE(review): a few names here (the ingress_* usage rules and
// cluster:usage:workload:capacity_physical_cpu_core_seconds) look like they
// may be generated in other groups (openshift-ingress.rules /
// kubernetes-recurring.rules) — confirm each name is actually produced in
// this group, otherwise its exclusion has no effect.
name: 'openshift-kubernetes.rules',
rules: [
{ record: 'cluster:capacity_cpu_cores_hyperthread_enabled:sum' },
{ record: 'cluster:capacity_cpu_sockets_hyperthread_enabled:sum' },
{ record: 'cluster:container_cpu_usage:ratio' },
{ record: 'cluster:container_spec_cpu_shares:ratio' },
{ record: 'cluster:control_plane:all_nodes_ready' },
{ record: 'cluster:cpu_core_hyperthreading' },
{ record: 'cluster:hyperthread_enabled_nodes' },
{ record: 'cluster:infra_nodes' },
{ record: 'cluster:master_infra_nodes' },
{ record: 'cluster:master_nodes' },
{ record: 'cluster:memory_usage:ratio' },
{ record: 'cluster:node_cpu:ratio' },
{ record: 'cluster:node_cpu:sum_rate5m' },
{ record: 'cluster:nodes_roles' },
{ record: 'cluster:usage:containers:sum' },
{ record: 'cluster:usage:ingress_frontend_bytes_in:rate5m:sum' },
{ record: 'cluster:usage:ingress_frontend_bytes_out:rate5m:sum' },
{ record: 'cluster:usage:ingress_frontend_connections:sum' },
{ record: 'cluster:usage:kube_node_ready:avg5m' },
{ record: 'cluster:usage:kube_schedulable_node_ready_reachable:avg5m' },
{ record: 'cluster:usage:openshift:ingress_request_error:fraction5m' },
{ record: 'cluster:usage:openshift:ingress_request_total:irate5m' },
{ record: 'cluster:usage:openshift:kube_running_pod_ready:avg' },
{ record: 'cluster:usage:pods:terminal:workload:sum' },
{ record: 'cluster:usage:resources:sum' },
{ record: 'cluster:usage:workload:capacity_physical_cpu_core_seconds' },
{ record: 'cluster:usage:workload:capacity_physical_cpu_cores:max:5m' },
{ record: 'cluster:usage:workload:capacity_physical_cpu_cores:min:5m' },
{ record: 'cluster:usage:workload:ingress_request_error:fraction5m' },
{ record: 'cluster:usage:workload:ingress_request_total:irate5m' },
{ record: 'cluster:usage:workload:kube_running_pod_ready:avg' },
{ record: 'kube_running_pod_ready' },
{ record: 'namespace:container_cpu_usage:sum' },
{ record: 'namespace:container_memory_usage_bytes:sum' },
{ record: 'namespace:container_spec_cpu_shares:sum' },
{ record: 'pod:container_cpu_usage:sum' },
{ record: 'pod:container_fs_usage_bytes:sum' },
{ record: 'pod:container_memory_usage_bytes:sum' },
{ record: 'pod:container_spec_cpu_shares:sum' },
{ record: 'workload:capacity_physical_cpu_cores:sum' },
],
},
{
  // openshift-ingress.rules: drop unused ingress/registry request-count
  // recording rules. Both names are recording rules (colon-separated names),
  // so they must be matched through the 'record' field — under 'alert' the
  // exclusions would never match.
  name: 'openshift-ingress.rules',
  rules: [
    { record: 'code:cluster:ingress_http_request_count:rate5m:sum' },
    { record: 'code:registry_api_request_count:rate:sum' },
  ],
},
{
  // openshift-sre.rules: drop unused pod-readiness recording rules. Both
  // names are recording rules (colon-separated names), so they must be
  // matched through the 'record' field — under 'alert' the exclusions would
  // never match.
  name: 'openshift-sre.rules',
  rules: [
    { record: 'kube_pod_status_ready:etcd:sum' },
    { record: 'kube_pod_status_ready:image_registry:sum' },
  ],
},
{
name: 'thanos-query',
rules: [
Expand Down

0 comments on commit 1f6a0ba

Please sign in to comment.