Skip to content

Commit

Permalink
RHOBS-995: Simplify cluster:capacity_effective_cpu_cores, add tests
Browse files Browse the repository at this point in the history
Simplify by dividing all x86_64 cpu counts by 2.

Note that this takes advantage of the way that the SKUs are structured,
where the capacity is written as multiples of "2 cores or 4vCPUs".

One difference in how this simplification works is that nodes
reporting more than 2 threads-per-core will be counted by CPUs, rather
than by cores.

When exactly 2 threads-per-core are reported, there is no functional
difference, as node_role_os_version_machine:cpu_capacity_cores:sum
already divides CPUs by 2.

This adds testing similar to what's in cluster-monitoring-operator,
covering only the `cluster:capacity_effective_cpu_cores` rule.

I had to update the prometheus version, as promtool was too old and
incorrectly flagging existing rules.

I added a note about rule tests to the README.

I did not update the prow config, because I don't know where it lives, but
I'm happy to update it given some hints.
  • Loading branch information
kahowell committed Feb 8, 2024
1 parent 23b2e4b commit d851af5
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 17 deletions.
24 changes: 21 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ UP_BIN=$(BIN_DIR)/up
# Locations of the helper binaries that this Makefile installs under $(BIN_DIR).
OTELCOL_BIN=$(BIN_DIR)/otelcol
MEMCACHED_BIN=$(BIN_DIR)/memcached
PROMETHEUS_BIN=$(BIN_DIR)/prometheus
PROMTOOL_BIN=$(BIN_DIR)/promtool
GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml
# ?= keeps any value already supplied by the environment or the command line.
JSONNET_BIN?=$(BIN_DIR)/jsonnet
JSONNETFMT_BIN?=$(BIN_DIR)/jsonnetfmt
# We need jsonnet on CI; here we default to the user's installed jsonnet binary; if nothing is installed, then install go-jsonnet.
# $(if ...) picks the path printed by `which` when it is non-empty, otherwise the $(BIN_DIR) copy.
JSONNET_LOCAL_OR_INSTALLED=$(if $(shell which jsonnet 2>/dev/null),$(shell which jsonnet 2>/dev/null),$(JSONNET_BIN))
JSONNETFMT_LOCAL_OR_INSTALLED=$(if $(shell which jsonnetfmt 2>/dev/null),$(shell which jsonnetfmt 2>/dev/null),$(JSONNETFMT_BIN))
Expand Down Expand Up @@ -158,12 +159,27 @@ shellcheck:
###########

# Aggregate test entry point: running it runs every suite listed as a prerequisite.
# make merges the prerequisite lists of repeated `test:` lines into one rule.
.PHONY: test
test: test-unit test-integration test-benchmark
test: test-unit test-rules test-integration test-benchmark

# Go unit tests over $(PKGS): -race enables the race detector, -short asks tests
# to skip long-running cases, and -count=1 bypasses the go test result cache.
.PHONY: test-unit
test-unit:
go test -race -short $(PKGS) -count=1

# Scratch directory for the rendered rules file and the promtool output logs.
# -p makes the recipe succeed when the directory already exists.
tmp:
	mkdir -p $@

# Render the recording rules from jsonnet into a file promtool can read.
# tmp is an order-only prerequisite: it must exist, but its mtime (which changes
# whenever a file inside it is written) must not trigger a re-render.
# The output is written to a temporary name and moved into place only on
# success, so a failed jsonnet run never leaves a truncated, newer-looking target.
tmp/rules.yaml: $(JSONNET_LOCAL_OR_INSTALLED) jsonnet/telemeter/rules.libsonnet | tmp
	$(JSONNET_LOCAL_OR_INSTALLED) -e "(import 'jsonnet/telemeter/rules.libsonnet')['prometheus']['recordingrules']" > "$@.tmp" && mv -f "$@.tmp" "$@"

# Validate the rendered rules with promtool; its stdout is saved to
# tmp/check-rules.out and echoed. promtool's exit status is captured explicitly
# because piping through tee would report tee's status and hide failures.
.PHONY: check-rules
check-rules: $(PROMTOOL_BIN) tmp/rules.yaml
	status=0; $(PROMTOOL_BIN) check rules tmp/rules.yaml > "tmp/$@.out" || status=$$?; cat "tmp/$@.out"; exit $$status

# Run the promtool rule unit tests in test/rulestests.yaml; stdout is saved to
# tmp/test-rules.out and echoed. Depends on check-rules so the rules file exists
# and is valid first. The exit status is handled as in check-rules.
.PHONY: test-rules
test-rules: check-rules
	status=0; $(PROMTOOL_BIN) test rules test/rulestests.yaml > "tmp/$@.out" || status=$$?; cat "tmp/$@.out"; exit $$status

# TODO(paulfantom): remove this target after removing it from Prow.
test-generate:
make --always-make && git diff --exit-code
Expand Down Expand Up @@ -226,7 +242,9 @@ $(MEMCACHED_BIN): | $(BIN_DIR) $(LIB_DIR)

# Download the Prometheus release tarball matching the host's `go env GOOS` /
# `go env GOARCH` and unpack it into $(BIN_DIR); --strip-components=1 drops the
# archive's top-level directory. $$ passes a literal $ through to the shell.
# NOTE(review): both a v2.3.2 and a v2.49.1 download line are present below; if
# this is not diff residue, the second extraction overwrites the first — confirm.
$(PROMETHEUS_BIN): $(BIN_DIR)
@echo "Downloading Prometheus"
curl -L "https://github.com/prometheus/prometheus/releases/download/v2.3.2/prometheus-2.3.2.$$(go env GOOS)-$$(go env GOARCH).tar.gz" | tar --strip-components=1 -xzf - -C $(BIN_DIR)
curl -L "https://github.com/prometheus/prometheus/releases/download/v2.49.1/prometheus-2.49.1.$$(go env GOOS)-$$(go env GOARCH).tar.gz" | tar --strip-components=1 -xzf - -C $(BIN_DIR)

# No recipe of its own: presumably promtool is unpacked from the same tarball
# as prometheus, so building $(PROMETHEUS_BIN) also provides it — TODO confirm.
$(PROMTOOL_BIN): $(PROMETHEUS_BIN)

$(OTELCOL_BIN): $(BIN_DIR)
@echo "Downloading the OTEL collector"
Expand Down
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,12 @@ make test-unit

## Adding new metrics to send via telemeter

Docs on the process on why and how to send these metrics are available [here](https://docs.google.com/document/d/1a6n5iBGM2QaIQRg9Lw4-Npj6QY9--Hpx3XYut-BrUSY/edit?usp=sharing).
Docs on the process on why and how to send these metrics are available [here](https://docs.google.com/document/d/1a6n5iBGM2QaIQRg9Lw4-Npj6QY9--Hpx3XYut-BrUSY/edit?usp=sharing).

## Testing recording rule changes

Run

```bash
make test-rules
```
21 changes: 8 additions & 13 deletions jsonnet/telemeter/rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -171,23 +171,18 @@
// OpenShift Cluster effective cores for subscription usage.
// This counts both worker nodes and, when the control plane is schedulable, control plane nodes.
// Only CoreOS nodes are counted.
// 1. x86_64 nodes that show hyperthreading in the telemetry have an accurate cores value in node_role_os_version_machine:cpu_capacity_cores:sum.
// 2. x86_64 nodes that do not show hyperthreading need the cores value adjusted to account for 2 threads per core (* 0.5).
// 3. Other CPU architectures are assumed to have accurate values in node_role_os_version_machine:cpu_capacity_cores:sum.
// 1. x86_64 nodes need the cores value adjusted to account for 2 threads per core (* 0.5).
// 2. Other CPU architectures are assumed to have accurate values in cluster:capacity_cpu_cores:sum.
record: 'cluster:capacity_effective_cpu_cores',
expr: |||
# worker ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) or cluster:cpu_capacity_cores:_id)+
# worker non-ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) / 2.0 or cluster:cpu_capacity_cores:_id)+
# worker amd64
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch="amd64"}) / 2.0 or cluster:cpu_capacity_cores:_id) +
# worker non-amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id)+
# schedulable control plane ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)+
# schedulable control plane non-ht amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id)+
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id) +
# schedulable control plane amd64
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id) +
# schedulable control plane non-amd64
(sum by (_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch!="amd64"}) * on(_id) group by(_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch!="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
|||,
},
],
Expand Down
109 changes: 109 additions & 0 deletions test/rulestests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# promtool unit tests for the cluster:capacity_effective_cpu_cores recording rule.
# The rules file is rendered from jsonnet into tmp/rules.yaml before these run.
rule_files:
  - ../tmp/rules.yaml

evaluation_interval: 1m

tests:
  # cluster:capacity_effective_cpu_cores tests
  #
  # Each scenario uses its own _id so the expectations below can select it in
  # isolation. Every scenario also provides a zero-valued
  # cluster:cpu_capacity_cores:_id series, which the rule's `or` branches fall
  # back to when a term matches no capacity series.
  - input_series:
      # amd64 worker node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",_id="amd64_worker",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="amd64_worker",tenant_id="tenant_id"}'
        values: '0'
      # non-amd64 worker node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",_id="nonamd64_worker",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="nonamd64_worker",tenant_id="tenant_id"}'
        values: '0'
      # amd64 schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="master",_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '0'
      - series: 'cluster_master_schedulable{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '1'
      # non-amd64 schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="master",_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '0'
      - series: 'cluster_master_schedulable{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '1'
      # amd64 non-schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="master",_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '0'
      - series: 'cluster_master_schedulable{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '0'
      # amd64 infra node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="infra",_id="amd64_infra",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="amd64_infra",tenant_id="tenant_id"}'
        values: '0'
      - series: 'cluster_master_schedulable{_id="amd64_infra",tenant_id="tenant_id"}'
        values: '1'
      # non-amd64 non-schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="master",_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '0'
      - series: 'cluster_master_schedulable{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '0'
      # non-amd64 infra node (arch must not be amd64, otherwise this only
      # repeats the amd64 infra scenario)
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="infra",_id="non-amd64_infra",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster:cpu_capacity_cores:_id{_id="non-amd64_infra",tenant_id="tenant_id"}'
        values: '0'
      - series: 'cluster_master_schedulable{_id="non-amd64_infra",tenant_id="tenant_id"}'
        values: '1'
    promql_expr_test:
      # amd64 worker (divided by 2)
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_worker"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_worker",tenant_id="tenant_id"}'
            value: 2
      # non-amd64 worker (not adjusted)
      - expr: cluster:capacity_effective_cpu_cores{_id="nonamd64_worker"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="nonamd64_worker",tenant_id="tenant_id"}'
            value: 4
      # amd64 schedulable control plane node (divided by 2)
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
            value: 2
      # non-amd64 schedulable control plane node (not adjusted)
      - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
            value: 4
      # amd64 non-schedulable control plane node excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_non_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
            value: 0
      # amd64 infra excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_infra"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_infra",tenant_id="tenant_id"}'
            value: 0
      # non-amd64 non-schedulable control plane node excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_non_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
            value: 0
      # non-amd64 infra not included
      - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_infra"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_infra",tenant_id="tenant_id"}'
            value: 0

0 comments on commit d851af5

Please sign in to comment.