Skip to content

Commit

Permalink
Simplify and fix issue in cluster:capacity_effective_cpu_cores
Browse files Browse the repository at this point in the history
Simplify by dividing all x86_64 CPU counts by 2.

This also addresses a bug where 0 was the effective value due to
label mismatches in `cluster:cpu_capacity_cores:_id`.

When we added a recording rule named `cluster:cpu_capacity_cores:_id`,
the recorded time series includes a tenant_id label and so it's not
equivalent to the expression it replaced.

Using `on (_id)` clauses restricts vector matching to the `_id` label, so the additional `tenant_id` label is ignored and the series match again.
  • Loading branch information
kahowell committed Feb 7, 2024
1 parent 2dd35e3 commit 78380dc
Showing 1 changed file with 8 additions and 13 deletions.
21 changes: 8 additions & 13 deletions jsonnet/telemeter/rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -171,23 +171,18 @@
// OpenShift Cluster effective cores for subscription usage.
// This counts both worker nodes and, when the control plane is schedulable, control plane nodes.
// Only CoreOS nodes are counted.
// 1. x86_64 nodes that show hyperthreading in the telemetry have an accurate cores value in node_role_os_version_machine:cpu_capacity_cores:sum.
// 2. x86_64 nodes that do not show hyperthreading need the cores value adjusted to account for 2 threads per core (* 0.5).
// 3. Other CPU architectures are assumed to have accurate values in node_role_os_version_machine:cpu_capacity_cores:sum.
// 1. x86_64 nodes need the cores value adjusted to account for 2 threads per core (* 0.5).
// 2. Other CPU architectures are assumed to have accurate values in cluster:capacity_cpu_cores:sum.
record: 'cluster:capacity_effective_cpu_cores',
expr: |||
# worker ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) or cluster:cpu_capacity_cores:_id)+
# worker non-ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) / 2.0 or cluster:cpu_capacity_cores:_id)+
# worker amd64
(sum by (_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch="amd64"}) / 2.0 or on (_id) cluster:cpu_capacity_cores:_id) + on (_id)
# worker non-amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id)+
# schedulable control plane ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)+
# schedulable control plane non-ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id)+
(sum by (_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch!="amd64"}) or on (_id) cluster:cpu_capacity_cores:_id) + on (_id)
# schedulable control plane amd64
(sum by (_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch="amd64"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) / 2.0 or on (_id) cluster:cpu_capacity_cores:_id) + on (_id)
# schedulable control plane non-amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch!="amd64"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
(sum by (_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch!="amd64"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) or on (_id) cluster:cpu_capacity_cores:_id)
|||,
},
],
Expand Down

0 comments on commit 78380dc

Please sign in to comment.