Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RHOBS-995: Simplify cluster:capacity_effective_cpu_cores, add tests #506

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 20 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ UP_BIN=$(BIN_DIR)/up
OTELCOL_BIN=$(BIN_DIR)/otelcol
MEMCACHED_BIN=$(BIN_DIR)/memcached
PROMETHEUS_BIN=$(BIN_DIR)/prometheus
PROMTOOL_BIN=$(BIN_DIR)/promtool
GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml
JSONNET_BIN?=$(BIN_DIR)/jsonnet
JSONNETFMT_BIN?=$(BIN_DIR)/jsonnetfmt
Expand Down Expand Up @@ -158,12 +159,27 @@ shellcheck:
###########

.PHONY: test
test: test-unit test-integration test-benchmark
test: test-unit test-rules test-integration test-benchmark

# test-unit runs `go test` over $(PKGS) with the race detector enabled (-race)
# and in short mode (-short).
# -count=1 requests exactly one run of each test, which also prevents Go from
# reusing cached test results.
.PHONY: test-unit
test-unit:
go test -race -short $(PKGS) -count=1

# Scratch directory for generated rule files and promtool output.
# -p keeps the recipe from failing when the directory already exists, for
# example when two jobs race to create it under `make -j`.
tmp:
	mkdir -p $@

# Render the Prometheus recording rules from the jsonnet source into
# tmp/rules.yaml so promtool can check and unit-test them.
#
# `tmp` is an order-only prerequisite (after `|`): the directory must exist,
# but its mtime changes whenever a file inside it is written, so it must not
# trigger a rebuild of the rules file.
#
# The output goes to a temporary file first and is renamed only when jsonnet
# succeeds. A failed render therefore never leaves an empty file that looks
# up to date.
tmp/rules.yaml: $(JSONNET_LOCAL_OR_INSTALLED) jsonnet/telemeter/rules.libsonnet | tmp
	$(JSONNET_LOCAL_OR_INSTALLED) -e "(import 'jsonnet/telemeter/rules.libsonnet')['prometheus']['recordingrules']" > $@.tmp
	mv -f $@.tmp $@

# check-rules validates the generated recording rules with
# `promtool check rules` and keeps promtool's stdout in tmp/check-rules.out.
#
# The output is written to the file and then printed, rather than piped
# through `tee`. A pipeline reports the exit status of its last command, so
# `promtool ... | tee` would succeed even when promtool fails, unless the
# shell runs with pipefail. The if/else form propagates promtool's exit
# status and does not depend on any shell option.
.PHONY: check-rules
check-rules: $(PROMTOOL_BIN) tmp/rules.yaml
	rm -f "tmp/$@.out"
	if $(PROMTOOL_BIN) check rules tmp/rules.yaml > "tmp/$@.out"; then \
		cat "tmp/$@.out"; \
	else \
		rc=$$?; cat "tmp/$@.out"; exit $$rc; \
	fi

# test-rules runs the recording-rule unit tests in test/rulestests.yaml with
# `promtool test rules` and keeps promtool's stdout in tmp/test-rules.out.
#
# It depends on check-rules, which in turn requires $(PROMTOOL_BIN) and
# tmp/rules.yaml, so the rules are generated and syntax-checked first.
#
# The output is written to the file and then printed, rather than piped
# through `tee`. A pipeline reports the exit status of its last command, so a
# failing promtool run would otherwise be reported as success unless the
# shell runs with pipefail.
.PHONY: test-rules
test-rules: check-rules
	if $(PROMTOOL_BIN) test rules test/rulestests.yaml > "tmp/$@.out"; then \
		cat "tmp/$@.out"; \
	else \
		rc=$$?; cat "tmp/$@.out"; exit $$rc; \
	fi

# TODO(paulfantom): remove this target after removing it from Prow.
test-generate:
make --always-make && git diff --exit-code
Expand Down Expand Up @@ -226,7 +242,9 @@ $(MEMCACHED_BIN): | $(BIN_DIR) $(LIB_DIR)

$(PROMETHEUS_BIN): $(BIN_DIR)
@echo "Downloading Prometheus"
curl -L "https://github.com/prometheus/prometheus/releases/download/v2.3.2/prometheus-2.3.2.$$(go env GOOS)-$$(go env GOARCH).tar.gz" | tar --strip-components=1 -xzf - -C $(BIN_DIR)
curl -L "https://github.com/prometheus/prometheus/releases/download/v2.49.1/prometheus-2.49.1.$$(go env GOOS)-$$(go env GOARCH).tar.gz" | tar --strip-components=1 -xzf - -C $(BIN_DIR)

# promtool has no recipe of its own: building $(PROMETHEUS_BIN) unpacks the
# Prometheus release archive into $(BIN_DIR).
# NOTE(review): this relies on the archive also containing promtool; confirm.
$(PROMTOOL_BIN): $(PROMETHEUS_BIN)

$(OTELCOL_BIN): $(BIN_DIR)
@echo "Downloading the OTEL collector"
Expand Down
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,12 @@ make test-unit

## Adding new metrics to send via telemeter

Docs on the process on why and how to send these metrics are available [here](https://docs.google.com/document/d/1a6n5iBGM2QaIQRg9Lw4-Npj6QY9--Hpx3XYut-BrUSY/edit?usp=sharing).
Documentation on why and how to send these metrics is available [here](https://docs.google.com/document/d/1a6n5iBGM2QaIQRg9Lw4-Npj6QY9--Hpx3XYut-BrUSY/edit?usp=sharing).

## Testing recording rule changes

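After changing a recording rule in `jsonnet/telemeter/rules.libsonnet`, run the following command. It regenerates the rules, validates them with `promtool check rules`, and runs the unit tests in `test/rulestests.yaml`: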

```bash
make test-rules
```
23 changes: 9 additions & 14 deletions jsonnet/telemeter/rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -164,30 +164,25 @@
// returns 0 for any cluster reporting core capacity, used to improve performance of cluster:capacity_effective_cpu_cores
record: 'cluster:cpu_capacity_cores:_id',
expr: |||
group by(_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos"}) * 0
group by(_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos"}) * 0
|||,
},
{
// OpenShift Cluster effective cores for subscription usage.
// This counts both worker nodes and, when the control plane is schedulable, control plane nodes.
// Only CoreOS nodes are counted.
// 1. x86_64 nodes that show hyperthreading in the telemetry have an accurate cores value in node_role_os_version_machine:cpu_capacity_cores:sum.
// 2. x86_64 nodes that do not show hyperthreading need the cores value adjusted to account for 2 threads per core (* 0.5).
// 3. Other CPU architectures are assumed to have accurate values in node_role_os_version_machine:cpu_capacity_cores:sum.
// 1. x86_64 nodes need the cores value adjusted to account for 2 threads per core (* 0.5).
// 2. Other CPU architectures are assumed to have accurate values in cluster:capacity_cpu_cores:sum.
record: 'cluster:capacity_effective_cpu_cores',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the recording expression only uses `cluster:capacity_cpu_cores:sum`, shouldn't we also modify `cluster:cpu_capacity_cores:_id` to use the same metric? It would also simplify the test inputs, since the tests would then properly generate `cluster:cpu_capacity_cores:_id`.

expr: |||
# worker ht amd64
(sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) or cluster:cpu_capacity_cores:_id)+
# worker non-ht amd64
(sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) / 2.0 or cluster:cpu_capacity_cores:_id)+
# worker amd64
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch="amd64"}) / 2.0 or cluster:cpu_capacity_cores:_id) +
# worker non-amd64
(sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id)+
# schedulable control plane ht amd64
(sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)+
# schedulable control plane non-ht amd64
(sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id)+
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id) +
# schedulable control plane amd64
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id) +
# schedulable control plane non-amd64
(sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch!="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
(sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch!="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
|||,
},
],
Expand Down
93 changes: 93 additions & 0 deletions test/rulestests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# promtool unit tests for the telemeter recording rules.
#
# The rule file is rendered from jsonnet/telemeter/rules.libsonnet into
# tmp/rules.yaml; `make test-rules` generates it and then runs this file.
rule_files:
  - ../tmp/rules.yaml

evaluation_interval: 1m

tests:
  # cluster:capacity_effective_cpu_cores tests
  #
  # Every fixture reports 4 cores from an rhcos node and uses its own _id, so
  # each scenario is evaluated in isolation. Expectations:
  #   * amd64 capacity is halved (2 threads per core), other arches are not;
  #   * control plane capacity only counts when cluster_master_schedulable == 1;
  #   * infra capacity never counts, regardless of architecture.
  # Excluded clusters still yield a sample with value 0, because every term of
  # the rule falls back to cluster:cpu_capacity_cores:_id.
  - input_series:
      # amd64 worker node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",_id="amd64_worker",tenant_id="tenant_id"}'
        values: '4'
      # non-amd64 worker node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",_id="nonamd64_worker",tenant_id="tenant_id"}'
        values: '4'
      # amd64 schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="master",_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster_master_schedulable{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '1'
      # non-amd64 schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="master",_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster_master_schedulable{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
        values: '1'
      # amd64 non-schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="master",_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster_master_schedulable{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '0'
      # amd64 infra node (masters are schedulable, infra must still be excluded)
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="infra",_id="amd64_infra",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster_master_schedulable{_id="amd64_infra",tenant_id="tenant_id"}'
        values: '1'
      # non-amd64 non-schedulable control plane node
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="master",_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster_master_schedulable{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
        values: '0'
      # non-amd64 infra node (masters are schedulable, infra must still be excluded)
      # The arch must not be amd64 here; otherwise this fixture only repeats the
      # amd64 infra case and the non-amd64 exclusion is never exercised.
      - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="infra",_id="non-amd64_infra",tenant_id="tenant_id"}'
        values: '4'
      - series: 'cluster_master_schedulable{_id="non-amd64_infra",tenant_id="tenant_id"}'
        values: '1'
    promql_expr_test:
      # amd64 worker (divided by 2)
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_worker"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_worker",tenant_id="tenant_id"}'
            value: 2
      # non-amd64 worker (not adjusted)
      - expr: cluster:capacity_effective_cpu_cores{_id="nonamd64_worker"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="nonamd64_worker",tenant_id="tenant_id"}'
            value: 4
      # amd64 schedulable control plane node (divided by 2)
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
            value: 2
      # non-amd64 schedulable control plane node (not adjusted)
      - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
            value: 4
      # amd64 non-schedulable control plane node excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_non_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
            value: 0
      # amd64 infra excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="amd64_infra"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_infra",tenant_id="tenant_id"}'
            value: 0
      # non-amd64 non-schedulable control plane node excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_non_schedulable"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
            value: 0
      # non-amd64 infra excluded
      - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_infra"}
        eval_time: 0
        exp_samples:
          - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_infra",tenant_id="tenant_id"}'
            value: 0