From 1d6ee8a2a5fb45766d6f3f8f3ed9660e60169069 Mon Sep 17 00:00:00 2001 From: Sebastian Jug Date: Tue, 24 Feb 2026 09:26:18 -0500 Subject: [PATCH 1/4] ready.sh: Add timestamps and not ready pod tracking --- ansible/roles/common/tasks/files/ready.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ansible/roles/common/tasks/files/ready.sh b/ansible/roles/common/tasks/files/ready.sh index 70c73294c3..ff8db1d24c 100755 --- a/ansible/roles/common/tasks/files/ready.sh +++ b/ansible/roles/common/tasks/files/ready.sh @@ -123,7 +123,12 @@ wait_for_ready() { # Print progress when pod count changes if [[ ${ready} -ne ${prev_ready} ]]; then - echo "${label}: ${ready}/${expected} ready" + elapsed=$(( $(date +%s) - START_TIME )) + echo "[${elapsed}s] ${label}: ${ready}/${expected} ready" + # Show which pods are not yet ready + oc get pods -A -o json \ + | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | " NOT READY: \(.metadata.namespace)/\(.metadata.name)"' \ + 2>/dev/null || true prev_ready=${ready} fi @@ -154,4 +159,5 @@ READY_SECONDS_ALL="" wait_for_ready "Non-storage pods" "${EXPECTED_PODS}" count_ready_nostorage READY_SECONDS_NON_STORAGE wait_for_ready "All pods" "${ALL_PODS}" count_ready_all READY_SECONDS_ALL -echo "{\"ready_seconds_non_storage\":${READY_SECONDS_NON_STORAGE},\"ready_seconds_all\":${READY_SECONDS_ALL}}" +END_TIME=$(date +%s) +echo "{\"ready_seconds_non_storage\":${READY_SECONDS_NON_STORAGE},\"ready_seconds_all\":${READY_SECONDS_ALL},\"start_epoch\":${START_TIME},\"end_epoch\":${END_TIME}}" From d946858cdbee9cc5ba8c1d4db8972e1910f13704 Mon Sep 17 00:00:00 2001 From: Sebastian Jug Date: Tue, 24 Feb 2026 09:27:16 -0500 Subject: [PATCH 2/4] install-microshift: Clean baseline before initial disk measurement --- .../roles/install-microshift/tasks/main.yml | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/ansible/roles/install-microshift/tasks/main.yml 
b/ansible/roles/install-microshift/tasks/main.yml index bedbe2e218..cf34521892 100644 --- a/ansible/roles/install-microshift/tasks/main.yml +++ b/ansible/roles/install-microshift/tasks/main.yml @@ -1,6 +1,44 @@ --- # install-microshift tasks +- name: check if microshift is installed + ansible.builtin.command: rpm -q microshift + register: microshift_installed + ignore_errors: true + changed_when: false + +- name: cleanup and uninstall existing microshift + block: + - name: stop microshift service + become: yes + ansible.builtin.systemd: + name: microshift + state: stopped + ignore_errors: true + + - name: find microshift cleanup script + ansible.builtin.find: + paths: /usr/bin + patterns: + - '.*microshift.*cleanup.*' + - '.*cleanup.*microshift.*' + use_regex: yes + register: find_cleanup + + - name: cleanup microshift data + become: yes + ansible.builtin.shell: echo 1 | {{ find_cleanup.files[0].path }} --all + when: find_cleanup.files | length > 0 + + - name: uninstall microshift packages and dependencies + become: yes + ansible.builtin.dnf: + name: + - microshift* + autoremove: yes + state: absent + when: microshift_installed.rc == 0 + - name: record initial disk space vars: filename: disk0.txt From 000caeb97c0d04c3acaa7f9be8ef81e54b629494 Mon Sep 17 00:00:00 2001 From: Sebastian Jug Date: Tue, 24 Feb 2026 09:28:13 -0500 Subject: [PATCH 3/4] install-logging-exporters: Fix node exporter package and service names --- ansible/roles/install-logging-exporters/defaults/main.yml | 2 +- ansible/roles/install-logging-exporters/tasks/main.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/install-logging-exporters/defaults/main.yml b/ansible/roles/install-logging-exporters/defaults/main.yml index 8644dfb8e6..a6f6325d19 100644 --- a/ansible/roles/install-logging-exporters/defaults/main.yml +++ b/ansible/roles/install-logging-exporters/defaults/main.yml @@ -6,4 +6,4 @@ process_exporter_url: 
https://github.com/ncabatoff/process-exporter/releases/dow prometheus_services: - process-exporter - - prometheus-node-exporter + - node_exporter diff --git a/ansible/roles/install-logging-exporters/tasks/main.yml b/ansible/roles/install-logging-exporters/tasks/main.yml index 55d9b6b5b4..78f42daf8c 100644 --- a/ansible/roles/install-logging-exporters/tasks/main.yml +++ b/ansible/roles/install-logging-exporters/tasks/main.yml @@ -1,7 +1,7 @@ - name: install node-exporter ansible.builtin.dnf: name: - - golang-github-prometheus-node-exporter + - node-exporter state: present - name: install process-exporter From 47823b54260dfa9acd55040aa413a3863dfa10cb Mon Sep 17 00:00:00 2001 From: Sebastian Jug Date: Tue, 24 Feb 2026 09:32:02 -0500 Subject: [PATCH 4/4] microshift-start: Add Prometheus network and image size metrics --- ansible/roles/common/tasks/files/crio-df.py | 347 +++++++++++++++ .../common/tasks/files/prom-network-query.py | 395 ++++++++++++++++++ .../roles/microshift-start/defaults/main.yml | 2 - ansible/roles/microshift-start/tasks/main.yml | 104 +++-- 4 files changed, 809 insertions(+), 39 deletions(-) create mode 100644 ansible/roles/common/tasks/files/crio-df.py create mode 100644 ansible/roles/common/tasks/files/prom-network-query.py diff --git a/ansible/roles/common/tasks/files/crio-df.py b/ansible/roles/common/tasks/files/crio-df.py new file mode 100644 index 0000000000..01d89fed93 --- /dev/null +++ b/ansible/roles/common/tasks/files/crio-df.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +""" +CRI-O Storage Disk Usage Reporter +A replacement for 'podman system df -v' that correctly calculates SharedSize for CRI-O storage. +Also calculates compressed (download) sizes from locally stored registry manifests. 
+""" + +import json +import os +import sys +from collections import defaultdict +from datetime import datetime, timezone + +# Configuration +STORAGE_ROOT = "/var/lib/containers/storage" +IMAGES_DIR = f"{STORAGE_ROOT}/overlay-images" +IMAGES_JSON = f"{IMAGES_DIR}/images.json" +LAYERS_JSON = f"{STORAGE_ROOT}/overlay-layers/layers.json" +VOLATILE_LAYERS_JSON = f"{STORAGE_ROOT}/overlay-layers/volatile-layers.json" + + +def load_json_file(filepath): + """Load and parse a JSON file.""" + try: + with open(filepath, 'r') as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + return [] + except Exception as e: + print(f"Warning: Could not load {filepath}: {e}", file=sys.stderr) + return [] + + +def format_size(bytes_value): + """Format bytes as human-readable string using decimal (SI) units to match podman.""" + if bytes_value == 0: + return "0B" + + units = ['B', 'KB', 'MB', 'GB', 'TB'] + unit_index = 0 + size = float(bytes_value) + + # Use 1000 (decimal) instead of 1024 (binary) to match podman's output + while size >= 1000 and unit_index < len(units) - 1: + size /= 1000 + unit_index += 1 + + # Format with appropriate precision + if unit_index == 0: # Bytes + return f"{int(size)}{units[unit_index]}" + elif size >= 100: + return f"{size:.0f}{units[unit_index]}" + elif size >= 10: + return f"{size:.1f}{units[unit_index]}" + else: + return f"{size:.2f}{units[unit_index]}" + + +def format_time_ago(timestamp): + """Format timestamp as 'X days/weeks/months ago'.""" + if not timestamp: + return "Unknown" + + try: + # Parse the timestamp + if isinstance(timestamp, str): + # Handle ISO format with timezone + created = datetime.fromisoformat(timestamp.replace('Z', '+00:00')) + else: + created = datetime.fromtimestamp(timestamp, tz=timezone.utc) + + now = datetime.now(timezone.utc) + diff = now - created + + days = diff.days + if days == 0: + hours = diff.seconds // 3600 + if hours == 0: + return "Just now" + return f"{hours} hour{'s' if hours > 1 else ''} 
ago" + elif days == 1: + return "Yesterday" + elif days < 7: + return f"{days} days ago" + elif days < 30: + weeks = days // 7 + return f"{weeks} week{'s' if weeks > 1 else ''} ago" + elif days < 365: + months = days // 30 + return f"{months} month{'s' if months > 1 else ''} ago" + else: + years = days // 365 + return f"{years} year{'s' if years > 1 else ''} ago" + except Exception: + return "Unknown" + + +def walk_image_layers(image, layers_by_id): + """Walk the layer chain from TopLayer up through parents.""" + layers_walked = [] + visited = set() + + # Start from TopLayer + layer_id = image.get("layer") + + # Walk up the parent chain + while layer_id and layer_id not in visited: + visited.add(layer_id) + + layer = layers_by_id.get(layer_id) + if not layer: + break + + layers_walked.append(layer_id) + layer_id = layer.get("parent") + + return layers_walked + + +def get_image_display_name(image): + """Get the best display name for an image.""" + names = image.get("names", []) + if not names: + return ("", "") + + # Use the first name + name = names[0] + + # Remove digest if present + if "@sha256:" in name: + name = name.split("@")[0] + + # Split into repository and tag + if ":" in name: + parts = name.rsplit(":", 1) + # If the right side still contains '/', this ':' belongs to a registry + # host:port segment and the image is untagged. 
+ if "/" in parts[1]: + return (name, "latest") + return (parts[0], parts[1]) + else: + return (name, "latest") + + +def get_compressed_sizes(images): + """Read registry manifests from local storage to get compressed layer sizes.""" + all_layers = {} # digest -> compressed size + layer_usage = defaultdict(int) + per_image = [] + + for image in images: + image_id = image["id"] + manifest_path = os.path.join(IMAGES_DIR, image_id, "manifest") + if not os.path.exists(manifest_path): + continue + + manifest = load_json_file(manifest_path) + if not manifest: + continue + + layers = manifest.get("layers", []) + image_compressed = 0 + image_layers = [] + + for layer in layers: + digest = layer.get("digest", "") + size = layer.get("size", 0) + image_compressed += size + image_layers.append({"digest": digest, "size": size}) + all_layers[digest] = size + layer_usage[digest] += 1 + + repo, _ = get_image_display_name(image) + per_image.append({ + "id": image_id[:12], + "name": repo, + "compressed": image_compressed, + "layers": image_layers, + }) + + shared_digests = {d for d, c in layer_usage.items() if c > 1} + return all_layers, layer_usage, shared_digests, per_image + + +def main(verbose=False): + """Main function to display CRI-O storage disk usage.""" + + # Check if running as root + if os.geteuid() != 0: + print("Error: This script must be run as root (sudo)", file=sys.stderr) + sys.exit(1) + + # Load data + images = load_json_file(IMAGES_JSON) + layers = load_json_file(LAYERS_JSON) + volatile_layers = load_json_file(VOLATILE_LAYERS_JSON) + + if not images: + print("No images found in CRI-O storage") + return + + # Create layer lookup map + layers_by_id = {layer["id"]: layer for layer in layers} + + # Count layer usage across all images + layer_count = defaultdict(int) + image_layers = {} + + for image in images: + image_id = image["id"] + walked_layers = walk_image_layers(image, layers_by_id) + image_layers[image_id] = walked_layers + + for layer_id in walked_layers: + 
layer_count[layer_id] += 1 + + # Calculate total storage first (each layer counted exactly once) + total_size = 0 + for layer_id in layer_count.keys(): + layer = layers_by_id[layer_id] + diff_size = layer.get("diff-size") + size = diff_size if diff_size is not None else (layer.get("uncompress_size", 0) or 0) + total_size += size + + # Map image top layers to image IDs for container counting + layer_to_image = {} + for img in images: + layer_to_image[img.get("layer")] = img["id"] + + # Count containers per image using volatile layers (matching podman's method) + containers_per_image = defaultdict(int) + for container in volatile_layers: + parent = container.get("parent") + if parent in layer_to_image: + image_id = layer_to_image[parent] + containers_per_image[image_id] += 1 + + # Calculate sizes for each image + image_data = [] + total_reclaimable = 0 + + for image in images: + image_id = image["id"] + walked_layers = image_layers[image_id] + + shared_size = 0 + unique_size = 0 + + for layer_id in walked_layers: + layer = layers_by_id[layer_id] + diff_size = layer.get("diff-size") + size = diff_size if diff_size is not None else (layer.get("uncompress_size", 0) or 0) + + if layer_count[layer_id] > 1: + shared_size += size + else: + unique_size += size + + image_size = shared_size + unique_size + + # Count containers using this image (from volatile layers) + container_count = containers_per_image.get(image_id, 0) + + # Get image metadata + repo, tag = get_image_display_name(image) + created = format_time_ago(image.get("created")) + + image_data.append({ + "id": image_id[:12], + "repository": repo, + "tag": tag, + "created": created, + "size": image_size, + "shared_size": shared_size, + "unique_size": unique_size, + "containers": container_count, + "reclaimable": unique_size if container_count == 0 else 0 + }) + + if container_count == 0: + total_reclaimable += unique_size + + # Calculate reclaimable percentage + reclaimable_pct = (total_reclaimable / total_size * 
100) if total_size > 0 else 0 + + # Calculate compressed download sizes + comp_layers, _, comp_shared, comp_per_image = get_compressed_sizes(images) + comp_deduplicated = sum(comp_layers.values()) + + if verbose: + # Detailed output similar to 'podman system df -v' + print("Images space usage:\n") + print(f"{'REPOSITORY':<55} {'TAG':<12} {'IMAGE ID':<12} {'CREATED':<10} {'SIZE':<12} {'SHARED SIZE':<12} {'UNIQUE SIZE':<12} {'CONTAINERS'}") + + for img in sorted(image_data, key=lambda x: x["size"], reverse=True): + print(f"{img['repository']:<55} {img['tag']:<12} {img['id']:<12} {img['created']:<10} " + f"{format_size(img['size']):<12} {format_size(img['shared_size']):<12} " + f"{format_size(img['unique_size']):<12} {img['containers']}") + + print("\nContainers space usage:\n") + print(f"{'CONTAINER ID':<12} {'IMAGE':<35} {'COMMAND':<20} {'LOCAL VOLUMES':<15} {'SIZE':<12} {'CREATED':<12} {'STATUS':<12} {'NAMES'}") + + # Container layers from volatile-layers.json + for container in volatile_layers[:10]: # Limit to first 10 + container_id = container.get("id", "")[:12] + parent = container.get("parent", "") + image = layer_to_image.get(parent, "")[:12] if parent in layer_to_image else "N/A" + print(f"{container_id:<12} {image:<35} {'N/A':<20} {'0':<15} {'N/A':<12} {'N/A':<12} {'N/A':<12} {'N/A'}") + + print("\nLocal Volumes space usage:\n") + print(f"{'VOLUME NAME':<30} {'LINKS':<10} {'SIZE'}") + # No volume info in CRI-O context + + # Compressed download sizes + print("\nCompressed download sizes:\n") + for img in sorted(comp_per_image, key=lambda x: x["compressed"], reverse=True): + print(f"{img['id']} {format_size(img['compressed']):>10} {img['name']}") + for layer in img["layers"]: + marker = " *" if layer["digest"] in comp_shared else "" + print(f" {layer['digest'][:19]}... 
{format_size(layer['size']):>10}{marker}") + + else: + # Summary output similar to 'podman system df' + print(f"{'TYPE':<15} {'TOTAL':<12} {'ACTIVE':<12} {'SIZE':<15} {'RECLAIMABLE'}") + print(f"{'Images':<15} {len(images):<12} {len([i for i in image_data if i['containers'] > 0]):<12} " + f"{format_size(total_size):<15} {format_size(total_reclaimable)} ({reclaimable_pct:.0f}%)") + print(f"{'Containers':<15} {len(volatile_layers):<12} {'0':<12} {'0B':<15} {'0B (0%)'}") + print(f"{'Local Volumes':<15} {'0':<12} {'0':<12} {'0B':<15} {'0B (0%)'}") + + # Print summary statistics + if verbose: + print("\n" + "="*80) + print("Storage Summary:") + print(f" Total Images: {len(images)}") + print(f" Images with containers: {len([i for i in image_data if i['containers'] > 0])}") + print(f" Total unique layers: {len(layer_count)}") + print(f" Shared layers (used by >1 image): {len([lid for lid, c in layer_count.items() if c > 1])}") + print(f" Total storage used: {format_size(total_size)}") + print(f" Reclaimable space: {format_size(total_reclaimable)} ({reclaimable_pct:.0f}%)") + print(f" Compressed download size: {format_size(comp_deduplicated)}") + print(f" Compression ratio: {total_size / comp_deduplicated:.1f}:1" if comp_deduplicated > 0 else "") + + +if __name__ == "__main__": + # Check for verbose flag + verbose = "-v" in sys.argv or "--verbose" in sys.argv + main(verbose) diff --git a/ansible/roles/common/tasks/files/prom-network-query.py b/ansible/roles/common/tasks/files/prom-network-query.py new file mode 100644 index 0000000000..e0fe0a7a48 --- /dev/null +++ b/ansible/roles/common/tasks/files/prom-network-query.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +"""Query Prometheus for network transfer during a measurement window. 
+ +Usage: + prom-network-query.py <start_epoch> <end_epoch> [options] + prom-network-query.py <hours> [options] + prom-network-query.py [options] + +Options: + --prometheus URL Prometheus base URL (default: http://zen3:9091) + --instance HOST:PORT Prometheus instance label to filter by (e.g. microshift:9100) + --device IFACE Network device (default: enp5s0) + --step STEP Range query step (default: 15s) + +Uses increase() to get accurate totals that handle counter resets natively. +Also shows raw counter time series for visibility into resets. +""" + +import argparse +import http.client +import ipaddress +import json +import socket +import sys +import urllib.parse +from dataclasses import dataclass +from datetime import datetime, timezone + +DEFAULT_PROMETHEUS = "http://zen3:9091" +DEFAULT_DEVICE = "enp5s0" +DEFAULT_STEP = "15s" + + +class PromQueryError(RuntimeError): + """Raised when a Prometheus query fails or returns ambiguous data.""" + + +@dataclass(frozen=True) +class PrometheusEndpoint: + """Normalized Prometheus endpoint details.""" + + scheme: str + host: str + port: int + base_path: str = "" + + @property + def display_host(self): + if ":" in self.host and not self.host.startswith("["): + return f"[{self.host}]" + return self.host + + @property + def base_url(self): + return f"{self.scheme}://{self.display_host}:{self.port}{self.base_path}" + + +def _is_local_or_private_ip(value): + ip = ipaddress.ip_address(value) + return ip.is_private or ip.is_loopback or ip.is_link_local + + +def _validate_prometheus_host(host): + """Allow only loopback/link-local/private Prometheus endpoints.""" + try: + if not _is_local_or_private_ip(host): + raise argparse.ArgumentTypeError( + f"--prometheus host must be loopback/link-local/private, got {host!r}" + ) + return + except ValueError: + pass + + try: + infos = socket.getaddrinfo(host, None, type=socket.SOCK_STREAM) + except socket.gaierror as exc: + raise argparse.ArgumentTypeError( + f"--prometheus host {host!r} could not be resolved: {exc}" + ) from 
exc + + disallowed = set() + for _, _, _, _, sockaddr in infos: + resolved = sockaddr[0].split("%", 1)[0] # strip IPv6 scope suffix if present + try: + if not _is_local_or_private_ip(resolved): + disallowed.add(resolved) + except ValueError: + disallowed.add(resolved) + + if disallowed: + raise argparse.ArgumentTypeError( + f"--prometheus host {host!r} resolves to non-private address(es): {', '.join(sorted(disallowed))}" + ) + + +def _validate_prometheus_url(value): + """Validate and normalize --prometheus URL to prevent SSRF.""" + parsed = urllib.parse.urlparse(value) + if parsed.scheme not in ("http", "https"): + raise argparse.ArgumentTypeError( + f"--prometheus must use http or https, got: {parsed.scheme!r}" + ) + if parsed.username or parsed.password: + raise argparse.ArgumentTypeError("--prometheus must not include credentials") + if parsed.query or parsed.fragment or parsed.params: + raise argparse.ArgumentTypeError( + "--prometheus must not include query, fragment, or params" + ) + if parsed.hostname is None: + raise argparse.ArgumentTypeError("--prometheus must include a hostname") + + try: + port = parsed.port + except ValueError as exc: + raise argparse.ArgumentTypeError(f"--prometheus has invalid port: {exc}") from exc + + if port is None: + port = 443 if parsed.scheme == "https" else 80 + + base_path = parsed.path.rstrip("/") + _validate_prometheus_host(parsed.hostname) + return PrometheusEndpoint(parsed.scheme, parsed.hostname, port, base_path) + + +def parse_window_args(argv): + parser = argparse.ArgumentParser( + description="Query Prometheus for RX/TX transfer over a measurement window." 
+ ) + parser.add_argument( + "window", + nargs="*", + help="Either <start_epoch> <end_epoch>, or <hours>, or empty for last 4h.", + ) + parser.add_argument( + "--prometheus", + default=DEFAULT_PROMETHEUS, + type=_validate_prometheus_url, + help=f"Prometheus base URL (default: {DEFAULT_PROMETHEUS})", + ) + parser.add_argument( + "--device", + default=DEFAULT_DEVICE, + help=f"Network device label value (default: {DEFAULT_DEVICE})", + ) + parser.add_argument( + "--instance", + default=None, + help="Prometheus instance label to filter by (e.g. microshift:9100)", + ) + parser.add_argument( + "--step", + default=DEFAULT_STEP, + help=f"Range query step for raw series output (default: {DEFAULT_STEP})", + ) + args = parser.parse_args(argv) + + if len(args.window) == 2: + try: + start = int(args.window[0]) + end = int(args.window[1]) + except ValueError as exc: + parser.error(f"start/end must be integers: {exc}") + elif len(args.window) == 1: + try: + hours = int(args.window[0]) + except ValueError as exc: + parser.error(f"hours must be an integer: {exc}") + now = int(datetime.now(timezone.utc).timestamp()) + start = now - hours * 3600 + end = now + elif len(args.window) == 0: + now = int(datetime.now(timezone.utc).timestamp()) + start = now - 4 * 3600 + end = now + else: + parser.error("Provide either <start_epoch> <end_epoch>, <hours>, or no positional args.") + + if end <= start: + parser.error("end_epoch must be greater than start_epoch.") + + return args, start, end + + +def prom_query(prometheus, endpoint, params): + qs = urllib.parse.urlencode(params) + api_path = f"{prometheus.base_path}/api/v1/{endpoint}" if prometheus.base_path else f"/api/v1/{endpoint}" + request_path = f"{api_path}?{qs}" + request_url = f"{prometheus.base_url}{request_path}" + connection_class = http.client.HTTPSConnection if prometheus.scheme == "https" else http.client.HTTPConnection + connection = connection_class(prometheus.host, prometheus.port, timeout=30) + + try: + connection.request("GET", request_path, headers={"Accept": "application/json"}) + 
response = connection.getresponse() + body = response.read() + except (http.client.HTTPException, OSError, TimeoutError) as exc: + raise PromQueryError(f"request failed for {request_url}: {exc}") from exc + finally: + connection.close() + + if response.status >= 400: + response_body = body.decode("utf-8", errors="replace").strip() + if response_body: + raise PromQueryError( + f"HTTP {response.status} from {request_url}: {response_body}" + ) + raise PromQueryError( + f"HTTP {response.status} from {request_url}: {response.reason}" + ) + + try: + data = json.loads(body) + except json.JSONDecodeError as exc: + raise PromQueryError(f"invalid JSON from {request_url}: {exc}") from exc + + if data.get("status") != "success": + error_type = data.get("errorType", "unknown_error") + error_msg = data.get("error", "no details") + raise PromQueryError( + f"Prometheus API error ({endpoint}): {error_type}: {error_msg}" + ) + + return data + + +def query_range(prometheus, query, start, end, step): + return prom_query(prometheus, "query_range", { + "query": query, "start": start, "end": end, "step": step, + }) + + +def query_instant(prometheus, query, time): + return prom_query(prometheus, "query", {"query": query, "time": time}) + + +def fmt_ts(epoch): + return datetime.fromtimestamp(epoch).strftime("%Y-%m-%d %H:%M:%S") + + +def fmt_bytes(b): + if b >= 1024 * 1024 * 1024: + return f"{b / 1024 / 1024 / 1024:.2f} GiB" + if b >= 1024 * 1024: + return f"{b / 1024 / 1024:.1f} MiB" + if b >= 1024: + return f"{b / 1024:.1f} KiB" + return f"{b} B" + + +def label_selector(device, instance=None): + """Build PromQL label selector string.""" + labels = f'device="{device}"' + if instance: + labels += f',instance="{instance}"' + return labels + + +def get_single_series(result, context): + """Return exactly one series; fail if response is ambiguous.""" + if not result: + return None + if len(result) > 1: + raise PromQueryError( + f"{context}: expected 1 series, got {len(result)}; " + "tighten 
labels (--instance/--device)." + ) + return result[0] + + +def get_increase(metric, start, end, prometheus, device, instance=None): + """Use increase() to get accurate total accounting for counter resets.""" + window = end - start + sel = label_selector(device, instance) + # Deduplicate label variants (job/pod/namespace/etc.) for the same host/device. + query = f"max by (instance, device) (increase({metric}{{{sel}}}[{window}s]))" + data = query_instant(prometheus, query, end) + series = get_single_series(data["data"]["result"], f"{metric} increase()") + if series is None: + return None + try: + return float(series["value"][1]) + except (KeyError, IndexError, TypeError, ValueError) as exc: + raise PromQueryError(f"{metric} increase(): unexpected response shape") from exc + + +def get_time_series(start, end, prometheus, device, step, instance=None): + """Get raw counter values to show resets.""" + sel = label_selector(device, instance) + # Deduplicate label variants (job/pod/namespace/etc.) for the same host/device. 
+ query = f"max by (instance, device) (node_network_receive_bytes_total{{{sel}}})" + data = query_range(prometheus, query, start, end, step) + series = get_single_series(data["data"]["result"], "raw receive counter") + if series is None: + return [] + try: + return series["values"] + except (KeyError, TypeError) as exc: + raise PromQueryError("raw receive counter: unexpected response shape") from exc + + +def main(): + args, start, end = parse_window_args(sys.argv[1:]) + + duration = end - start + print(f"Measurement window: {fmt_ts(start)} -> {fmt_ts(end)} ({duration}s)") + print(f"Prometheus: {args.prometheus.base_url}") + print(f"Instance: {args.instance or '(all)'}") + print(f"Device: {args.device}") + print() + + try: + # Get increase() — the accurate total from Prometheus + rx_increase = get_increase( + "node_network_receive_bytes_total", start, end, + args.prometheus, args.device, args.instance, + ) + tx_increase = get_increase( + "node_network_transmit_bytes_total", start, end, + args.prometheus, args.device, args.instance, + ) + except PromQueryError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + print("=== Network transfer (Prometheus increase) ===") + if rx_increase is not None: + print(f" RX: {fmt_bytes(rx_increase)} ({int(rx_increase):,} bytes)") + else: + print(" RX: no data") + if tx_increase is not None: + print(f" TX: {fmt_bytes(tx_increase)} ({int(tx_increase):,} bytes)") + else: + print(" TX: no data") + if rx_increase is not None and tx_increase is not None: + print(f" Total: {fmt_bytes(rx_increase + tx_increase)}") + print() + + try: + # Get raw time series for visibility + values = get_time_series( + start, end, args.prometheus, args.device, args.step, args.instance + ) + except PromQueryError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 1 + + if not values: + print("No time series data available.") + return 0 + + print(f"=== Raw counter time series ({len(values)} samples, step={args.step}) ===") + 
print(f"{'timestamp':<24} {'rx_bytes':>16} {'rx_MB':>10} {'delta_MB':>10}") + print("-" * 64) + + prev_val = None + resets = [] + for ts, val in values: + val = int(float(val)) + dt = fmt_ts(ts) + mb = val / 1024 / 1024 + delta = "" + if prev_val is not None: + diff = val - prev_val + delta = f"{diff / 1024 / 1024:>10.2f}" + if diff < 0: + delta += " *** RESET ***" + resets.append((ts, prev_val, val)) + print(f"{dt:<24} {val:>16} {mb:>10.2f} {delta}") + prev_val = val + + if resets: + print() + print(f"Counter resets: {len(resets)}") + for ts, before, after in resets: + print(f" {fmt_ts(ts)}: {before:,} -> {after:,} " + f"(lost {fmt_bytes(before - after)})") + + # Output JSON summary for machine parsing + print() + summary = { + "start_epoch": start, + "end_epoch": end, + "duration_seconds": duration, + "device": args.device, + "rx_bytes": int(rx_increase) if rx_increase is not None else 0, + "tx_bytes": int(tx_increase) if tx_increase is not None else 0, + "counter_resets": len(resets), + "samples": len(values), + } + print(json.dumps(summary)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ansible/roles/microshift-start/defaults/main.yml b/ansible/roles/microshift-start/defaults/main.yml index 5b5cb193cc..d20e545e16 100644 --- a/ansible/roles/microshift-start/defaults/main.yml +++ b/ansible/roles/microshift-start/defaults/main.yml @@ -12,5 +12,3 @@ du_dirs: - /usr/bin/* sample_interval: 5 - -vnstat_db: /var/lib/vnstat/vnstat.db diff --git a/ansible/roles/microshift-start/tasks/main.yml b/ansible/roles/microshift-start/tasks/main.yml index 3962406931..50a9ea4e4a 100644 --- a/ansible/roles/microshift-start/tasks/main.yml +++ b/ansible/roles/microshift-start/tasks/main.yml @@ -34,26 +34,6 @@ state: stopped enabled: no -- name: check for vnstat - ansible.builtin.command: rpm -q vnstat - register: vnstat_check - ignore_errors: true - -- name: vnstat cleanup - become: yes - block: - - name: stop & enable vnstat service - 
ansible.builtin.systemd: - name: vnstat - state: stopped - enabled: yes - - - name: delete vnstat db - ansible.builtin.file: - path: "{{ vnstat_db }}" - state: absent - when: vnstat_check.rc == 0 - - name: create .kube home dir ansible.builtin.file: path: ~/.kube/ @@ -82,7 +62,7 @@ block: - name: source pbench-agent & register-tool-set ansible.builtin.shell: source /etc/profile.d/pbench-agent.sh && pbench-register-tool-set - + - name: set new pidstat interval ansible.builtin.shell: source /etc/profile.d/pbench-agent.sh && pbench-register-tool --name=pidstat -- --interval={{ sample_interval }} @@ -91,7 +71,7 @@ async: "{{ pbench_record_duration|int * 2 }}" poll: 0 register: pbench_user_benchmark_result - + - name: Pause for 60 seconds to gather steady state for pbench tool recording ansible.builtin.pause: seconds: 60 @@ -104,6 +84,10 @@ register: cadvisor_check ignore_errors: true +- name: record network measurement start epoch + ansible.builtin.command: date +%s + register: network_start_epoch + - name: measure microshift service boot time include_tasks: roles/common/tasks/boot.yml vars: @@ -138,22 +122,68 @@ include_tasks: roles/common/tasks/disk.yml loop: "{{ du_dirs }}" -- name: vnstat collection tasks +- name: capture container image sizes block: - - name: wait for vnstat db to populate - ansible.builtin.shell: vnstat | grep today - retries: 60 - delay: 10 - register: vnstat_db - until: vnstat_db.rc == 0 - - - name: get vnstat network usage - ansible.builtin.command: vnstat - register: vnstat - - - name: record network usage to file + - name: copy crio-df script to remote host + become: yes + ansible.builtin.copy: + src: roles/common/tasks/files/crio-df.py + dest: /tmp/crio-df.py + mode: '0755' + + - name: run crio-df script + become: yes + ansible.builtin.command: python3 /tmp/crio-df.py -v + register: crio_df + + - name: save container image sizes to results + ansible.builtin.copy: + content: "{{ crio_df.stdout }}" + dest: "{{ results_dir }}/images.txt" + 
delegate_to: localhost + +- name: record network measurement end epoch + ansible.builtin.command: date +%s + register: network_end_epoch + +- name: resolve prometheus network query settings + ansible.builtin.set_fact: + prom_network_endpoint: >- + {{ + prometheus_query_endpoint | default( + 'http://' ~ + ( + ( + hostvars[(groups['logging'] | default([]) | first)].ansible_host + | default((groups['logging'] | default([]) | first)) + ) + if (groups['logging'] | default([]) | length > 0) + else 'zen3' + ) + ~ ':' ~ (prometheus_port | default(9091) | string), + true + ) + }} + prom_network_device: "{{ network_device | default(ansible_default_ipv4.interface | default('enp5s0'), true) }}" + prom_network_instance: "{{ inventory_hostname }}:9100" + when: prometheus_logging | bool + +- name: query prometheus for network transfer + block: + - name: run prometheus network query + ansible.builtin.command: > + python3 roles/common/tasks/files/prom-network-query.py + --prometheus {{ prom_network_endpoint }} + --instance {{ prom_network_instance }} + --device {{ prom_network_device }} + {{ network_start_epoch.stdout }} + {{ network_end_epoch.stdout }} + register: prom_network + delegate_to: localhost + + - name: save network data to results ansible.builtin.copy: - content: "{{ vnstat.stdout }}" + content: "{{ prom_network.stdout }}" dest: "{{ results_dir }}/network.txt" delegate_to: localhost - when: vnstat_check.rc == 0 + when: prometheus_logging | bool