openshift · openshift-merge-bot · Feb 14, 2024 · Feb 7, 2024 · Feb 12, 2024 · simonpasquier
diff --git a/Makefile b/Makefile
@@ -20,6 +20,7 @@ UP_BIN=$(BIN_DIR)/up
 OTELCOL_BIN=$(BIN_DIR)/otelcol
 MEMCACHED_BIN=$(BIN_DIR)/memcached
 PROMETHEUS_BIN=$(BIN_DIR)/prometheus
+PROMTOOL_BIN=$(BIN_DIR)/promtool
 GOJSONTOYAML_BIN=$(BIN_DIR)/gojsontoyaml
 JSONNET_BIN?=$(BIN_DIR)/jsonnet
 JSONNETFMT_BIN?=$(BIN_DIR)/jsonnetfmt
@@ -158,12 +159,27 @@ shellcheck:
 ###########
 
 .PHONY: test
-test: test-unit test-integration test-benchmark
+test: test-unit test-rules test-integration test-benchmark
 
 .PHONY: test-unit
 test-unit:
 	go test -race -short $(PKGS) -count=1
 
+tmp: # scratch directory for the generated rules file and promtool output
+	mkdir -p $@
+
+tmp/rules.yaml: $(JSONNET_LOCAL_OR_INSTALLED) jsonnet/telemeter/rules.libsonnet | tmp # order-only: tmp must exist, but its mtime must not retrigger generation
+	$(JSONNET_LOCAL_OR_INSTALLED) -e "(import 'jsonnet/telemeter/rules.libsonnet')['prometheus']['recordingrules']" > $@
+
+.PHONY: check-rules
+check-rules: $(PROMTOOL_BIN) tmp/rules.yaml # validate the generated rules; output is kept in tmp/check-rules.out
+	rm -f "tmp/$@.out"
+	rc=0; $(PROMTOOL_BIN) check rules tmp/rules.yaml > "tmp/$@.out" || rc=$$?; cat "tmp/$@.out"; exit $$rc
+
+.PHONY: test-rules
+test-rules: check-rules # unit-test the rules; check-rules supplies promtool, tmp/ and tmp/rules.yaml
+	rc=0; $(PROMTOOL_BIN) test rules test/rulestests.yaml > "tmp/$@.out" || rc=$$?; cat "tmp/$@.out"; exit $$rc
+
 # TODO(paulfantom): remove this target after removing it from Prow.
 test-generate:
 	make --always-make && git diff --exit-code
@@ -226,7 +242,9 @@ $(MEMCACHED_BIN): | $(BIN_DIR) $(LIB_DIR)
 
 $(PROMETHEUS_BIN): $(BIN_DIR)
 	@echo "Downloading Prometheus"
-	curl -L "https://github.com/prometheus/prometheus/releases/download/v2.3.2/prometheus-2.3.2.$$(go env GOOS)-$$(go env GOARCH).tar.gz" | tar --strip-components=1 -xzf - -C $(BIN_DIR)
+	curl -L "https://github.com/prometheus/prometheus/releases/download/v2.49.1/prometheus-2.49.1.$$(go env GOOS)-$$(go env GOARCH).tar.gz" | tar --strip-components=1 -xzf - -C $(BIN_DIR)
+
+$(PROMTOOL_BIN): $(PROMETHEUS_BIN) # no recipe: presumably promtool is unpacked into $(BIN_DIR) by the Prometheus tarball extraction above -- confirm
 
 $(OTELCOL_BIN): $(BIN_DIR)
 	@echo "Downloading the OTEL collector"

diff --git a/README.md b/README.md
@@ -83,4 +83,12 @@ make test-unit
 
 ## Adding new metrics to send via telemeter
 
-Docs on the process on why and how to send these metrics are available [here](https://docs.google.com/document/d/1a6n5iBGM2QaIQRg9Lw4-Npj6QY9--Hpx3XYut-BrUSY/edit?usp=sharing).
+Docs describing the process (why and how) for sending these metrics are available [here](https://docs.google.com/document/d/1a6n5iBGM2QaIQRg9Lw4-Npj6QY9--Hpx3XYut-BrUSY/edit?usp=sharing).
+
+## Testing recording rule changes
+
+Generate the rules with jsonnet, then validate and unit-test them with `promtool` by running:
+
+```bash
+make test-rules
+```
diff --git a/jsonnet/telemeter/rules.libsonnet b/jsonnet/telemeter/rules.libsonnet
@@ -164,30 +164,25 @@
               // returns 0 for any cluster reporting core capacity, used to improve performance of cluster:capacity_effective_cpu_cores
               record: 'cluster:cpu_capacity_cores:_id',
               expr: |||
-                group by(_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos"}) * 0
+                group by(_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos"}) * 0
               |||,
             },
             {
               // OpenShift Cluster effective cores for subscription usage.
               // This counts both worker nodes and, when the control plane is schedulable, control plane nodes.
               // Only CoreOS nodes are counted.
-              // 1. x86_64 nodes that show hyperthreading in the telemetry have an accurate cores value in node_role_os_version_machine:cpu_capacity_cores:sum.
-              // 2. x86_64 nodes that do not show hyperthreading need the cores value adjusted to account for 2 threads per core (* 0.5).
-              // 3. Other CPU architectures are assumed to have accurate values in node_role_os_version_machine:cpu_capacity_cores:sum.
+              // 1. amd64 (x86_64) nodes always have the cores value halved to account for 2 threads per core (* 0.5), regardless of reported hyperthreading.
+              // 2. Other CPU architectures are assumed to have accurate values in cluster:capacity_cpu_cores:sum and are not adjusted.
               record: 'cluster:capacity_effective_cpu_cores',
               expr: |||
-                # worker ht amd64
-                (sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) or cluster:cpu_capacity_cores:_id)+
-                # worker non-ht amd64
-                (sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) / 2.0 or cluster:cpu_capacity_cores:_id)+
+                # worker amd64
+                (sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch="amd64"}) / 2.0 or cluster:cpu_capacity_cores:_id) +
                 # worker non-amd64
-                (sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_infra!="true",label_node_role_kubernetes_io_master!="true",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id)+
-                # schedulable control plane ht amd64
-                (sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="true"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)+
-                # schedulable control plane non-ht amd64
-                (sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch="amd64",label_node_hyperthread_enabled="false"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id)+
+                (sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io!="master",label_node_role_kubernetes_io!="infra",label_kubernetes_io_arch!="amd64"}) or cluster:cpu_capacity_cores:_id) +
+                # schedulable control plane amd64
+                (sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) / 2.0 or cluster:cpu_capacity_cores:_id) +
                 # schedulable control plane non-amd64
-                (sum by (_id, tenant_id) (node_role_os_version_machine:cpu_capacity_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io_master="true",label_kubernetes_io_arch!="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
+                (sum by (_id, tenant_id) (cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_node_role_kubernetes_io="master",label_kubernetes_io_arch!="amd64"}) * on(_id, tenant_id) group by(_id, tenant_id) (cluster_master_schedulable == 1) or cluster:cpu_capacity_cores:_id)
               |||,
             },
           ],

diff --git a/test/rulestests.yaml b/test/rulestests.yaml
@@ -0,0 +1,93 @@
+rule_files:
+    - ../tmp/rules.yaml
+
+evaluation_interval: 1m
+
+tests:
+    # cluster:capacity_effective_cpu_cores tests
+    - input_series:
+          # amd64 worker node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",_id="amd64_worker",tenant_id="tenant_id"}'
+            values: '4'
+          # non-amd64 worker node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",_id="nonamd64_worker",tenant_id="tenant_id"}'
+            values: '4'
+          # amd64 schedulable control plane node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="master",_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
+            values: '4'
+          - series: 'cluster_master_schedulable{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
+            values: '1'
+          # non-amd64 schedulable control plane node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="master",_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
+            values: '4'
+          - series: 'cluster_master_schedulable{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
+            values: '1'
+          # amd64 non-schedulable control plane node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="master",_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
+            values: '4'
+          - series: 'cluster_master_schedulable{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
+            values: '0'
+          # amd64 infra node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="amd64",label_node_role_kubernetes_io="infra",_id="amd64_infra",tenant_id="tenant_id"}'
+            values: '4'
+          - series: 'cluster_master_schedulable{_id="amd64_infra",tenant_id="tenant_id"}'
+            values: '1'
+          # non-amd64 non-schedulable control plane node
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="master",_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
+            values: '4'
+          - series: 'cluster_master_schedulable{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
+            values: '0'
+          # non-amd64 infra node (arm64, so the non-amd64 worker selector is the one that must exclude it)
+          - series: 'cluster:capacity_cpu_cores:sum{label_node_openshift_io_os_id="rhcos",label_kubernetes_io_arch="arm64",label_node_role_kubernetes_io="infra",_id="non-amd64_infra",tenant_id="tenant_id"}'
+            values: '4'
+          - series: 'cluster_master_schedulable{_id="non-amd64_infra",tenant_id="tenant_id"}'
+            values: '1'
+      promql_expr_test:
+          # amd64 worker (divided by 2)
+          - expr: cluster:capacity_effective_cpu_cores{_id="amd64_worker"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_worker",tenant_id="tenant_id"}'
+                  value: 2
+          # non-amd64 worker (not adjusted)
+          - expr: cluster:capacity_effective_cpu_cores{_id="nonamd64_worker"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="nonamd64_worker",tenant_id="tenant_id"}'
+                  value: 4
+          # amd64 schedulable control plane node (divided by 2)
+          - expr: cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_schedulable"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_schedulable",tenant_id="tenant_id"}'
+                  value: 2
+          # non-amd64 schedulable control plane node (not adjusted)
+          - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_schedulable"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_schedulable",tenant_id="tenant_id"}'
+                  value: 4
+          # amd64 non-schedulable control plane node excluded
+          - expr: cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_non_schedulable"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
+                  value: 0
+          # amd64 infra excluded
+          - expr: cluster:capacity_effective_cpu_cores{_id="amd64_infra"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="amd64_infra",tenant_id="tenant_id"}'
+                  value: 0
+          # non-amd64 non-schedulable control plane node excluded
+          - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_non_schedulable"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_control_plane_non_schedulable",tenant_id="tenant_id"}'
+                  value: 0
+          # non-amd64 infra not included
+          - expr: cluster:capacity_effective_cpu_cores{_id="non-amd64_infra"}
+            eval_time: 0
+            exp_samples:
+                - labels: 'cluster:capacity_effective_cpu_cores{_id="non-amd64_infra",tenant_id="tenant_id"}'
+                  value: 0