From 1d6ee8a2a5fb45766d6f3f8f3ed9660e60169069 Mon Sep 17 00:00:00 2001
From: Sebastian Jug <seb@stianj.ug>
Date: Tue, 24 Feb 2026 09:26:18 -0500
Subject: [PATCH 1/4] ready.sh: Add timestamps and not ready pod tracking

---
 ansible/roles/common/tasks/files/ready.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/common/tasks/files/ready.sh b/ansible/roles/common/tasks/files/ready.sh
index 70c73294c3..ff8db1d24c 100755
--- a/ansible/roles/common/tasks/files/ready.sh
+++ b/ansible/roles/common/tasks/files/ready.sh
@@ -123,7 +123,12 @@ wait_for_ready() {
 
     # Print progress when pod count changes
     if [[ ${ready} -ne ${prev_ready} ]]; then
-      echo "${label}: ${ready}/${expected} ready"
+      elapsed=$(( $(date +%s) - START_TIME ))
+      echo "[${elapsed}s] ${label}: ${ready}/${expected} ready"
+      # Show which pods are not yet ready
+      oc get pods -A -o json \
+        | jq -r '.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status!="True")) | "  NOT READY: \(.metadata.namespace)/\(.metadata.name)"' \
+        2>/dev/null || true
       prev_ready=${ready}
     fi
 
@@ -154,4 +159,5 @@ READY_SECONDS_ALL=""
 wait_for_ready "Non-storage pods" "${EXPECTED_PODS}" count_ready_nostorage READY_SECONDS_NON_STORAGE
 wait_for_ready "All pods" "${ALL_PODS}" count_ready_all READY_SECONDS_ALL
 
-echo "{\"ready_seconds_non_storage\":${READY_SECONDS_NON_STORAGE},\"ready_seconds_all\":${READY_SECONDS_ALL}}"
+END_TIME=$(date +%s)
+echo "{\"ready_seconds_non_storage\":${READY_SECONDS_NON_STORAGE},\"ready_seconds_all\":${READY_SECONDS_ALL},\"start_epoch\":${START_TIME},\"end_epoch\":${END_TIME}}"

From d946858cdbee9cc5ba8c1d4db8972e1910f13704 Mon Sep 17 00:00:00 2001
From: Sebastian Jug <seb@stianj.ug>
Date: Tue, 24 Feb 2026 09:27:16 -0500
Subject: [PATCH 2/4] install-microshift: Clean baseline before initial disk
 measurement

---
 .../roles/install-microshift/tasks/main.yml   | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/ansible/roles/install-microshift/tasks/main.yml b/ansible/roles/install-microshift/tasks/main.yml
index bedbe2e218..cf34521892 100644
--- a/ansible/roles/install-microshift/tasks/main.yml
+++ b/ansible/roles/install-microshift/tasks/main.yml
@@ -1,6 +1,44 @@
 ---
 # install-microshift tasks
 
+- name: check if microshift is installed
+  ansible.builtin.command: rpm -q microshift
+  register: microshift_installed
+  failed_when: false  # a non-zero rc only means "not installed"; the rc is tested below
+  changed_when: false
+
+- name: cleanup and uninstall existing microshift
+  block:  # the whole block is gated by the "when" on its last line
+    - name: stop microshift service
+      become: yes
+      ansible.builtin.systemd:
+        name: microshift
+        state: stopped
+      ignore_errors: true  # keep going even if the service cannot be stopped
+
+    - name: find microshift cleanup script
+      ansible.builtin.find:
+        paths: /usr/bin
+        patterns:
+          - '.*microshift.*cleanup.*'
+          - '.*cleanup.*microshift.*'
+        use_regex: yes  # the patterns above are regular expressions, not shell globs
+      register: find_cleanup
+
+    - name: cleanup microshift data
+      become: yes  # NOTE(review): "echo 1" presumably answers the script's prompt - confirm
+      ansible.builtin.shell: echo 1 | {{ find_cleanup.files[0].path }} --all
+      when: find_cleanup.files | length > 0  # skipped when no cleanup script was found
+
+    - name: uninstall microshift packages and dependencies
+      become: yes
+      ansible.builtin.dnf:
+        name:
+          - microshift*
+        autoremove: yes  # also drop dependencies that are no longer required
+        state: absent
+  when: microshift_installed.rc == 0
+
 - name: record initial disk space
   vars:
     filename: disk0.txt

From 000caeb97c0d04c3acaa7f9be8ef81e54b629494 Mon Sep 17 00:00:00 2001
From: Sebastian Jug <seb@stianj.ug>
Date: Tue, 24 Feb 2026 09:28:13 -0500
Subject: [PATCH 3/4] install-logging-exporters: Fix node exporter package and
 service names

---
 ansible/roles/install-logging-exporters/defaults/main.yml | 2 +-
 ansible/roles/install-logging-exporters/tasks/main.yml    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ansible/roles/install-logging-exporters/defaults/main.yml b/ansible/roles/install-logging-exporters/defaults/main.yml
index 8644dfb8e6..a6f6325d19 100644
--- a/ansible/roles/install-logging-exporters/defaults/main.yml
+++ b/ansible/roles/install-logging-exporters/defaults/main.yml
@@ -6,4 +6,4 @@ process_exporter_url: https://github.com/ncabatoff/process-exporter/releases/dow
 
 prometheus_services:
   - process-exporter
-  - prometheus-node-exporter
+  - node_exporter
diff --git a/ansible/roles/install-logging-exporters/tasks/main.yml b/ansible/roles/install-logging-exporters/tasks/main.yml
index 55d9b6b5b4..78f42daf8c 100644
--- a/ansible/roles/install-logging-exporters/tasks/main.yml
+++ b/ansible/roles/install-logging-exporters/tasks/main.yml
@@ -1,7 +1,7 @@
 - name: install node-exporter
   ansible.builtin.dnf:
     name:
-    - golang-github-prometheus-node-exporter
+    - node-exporter
     state: present
 
 - name: install process-exporter

From 47823b54260dfa9acd55040aa413a3863dfa10cb Mon Sep 17 00:00:00 2001
From: Sebastian Jug <seb@stianj.ug>
Date: Tue, 24 Feb 2026 09:32:02 -0500
Subject: [PATCH 4/4] microshift-start: Add Prometheus network and image size
 metrics

---
 ansible/roles/common/tasks/files/crio-df.py   | 347 +++++++++++++++
 .../common/tasks/files/prom-network-query.py  | 395 ++++++++++++++++++
 .../roles/microshift-start/defaults/main.yml  |   2 -
 ansible/roles/microshift-start/tasks/main.yml | 104 +++--
 4 files changed, 809 insertions(+), 39 deletions(-)
 create mode 100644 ansible/roles/common/tasks/files/crio-df.py
 create mode 100644 ansible/roles/common/tasks/files/prom-network-query.py

diff --git a/ansible/roles/common/tasks/files/crio-df.py b/ansible/roles/common/tasks/files/crio-df.py
new file mode 100644
index 0000000000..01d89fed93
--- /dev/null
+++ b/ansible/roles/common/tasks/files/crio-df.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""
+CRI-O Storage Disk Usage Reporter
+A replacement for 'podman system df -v' that correctly calculates SharedSize for CRI-O storage.
+Also calculates compressed (download) sizes from locally stored registry manifests.
+"""
+
+import json
+import os
+import sys
+from collections import defaultdict
+from datetime import datetime, timezone
+
+# Configuration
+STORAGE_ROOT = "/var/lib/containers/storage"
+IMAGES_DIR = f"{STORAGE_ROOT}/overlay-images"
+IMAGES_JSON = f"{IMAGES_DIR}/images.json"
+LAYERS_JSON = f"{STORAGE_ROOT}/overlay-layers/layers.json"
+VOLATILE_LAYERS_JSON = f"{STORAGE_ROOT}/overlay-layers/volatile-layers.json"
+
+
+def load_json_file(filepath):
+    """Return the decoded JSON document stored at filepath, or [] if it cannot be loaded."""
+    try:
+        with open(filepath, 'r') as handle:
+            return json.load(handle)
+    except (FileNotFoundError, json.JSONDecodeError):
+        return []  # an absent or malformed file is quietly treated as empty
+    except Exception as err:
+        print(f"Warning: Could not load {filepath}: {err}", file=sys.stderr)
+        return []
+
+
+def format_size(bytes_value):
+    """Render a byte count in decimal (SI) units, the convention podman uses.
+
+    Plain bytes print as an integer; larger units print with 2, 1 or 0 decimals
+    depending on whether the scaled value is below 10, below 100, or larger.
+    """
+    if bytes_value == 0:
+        return "0B"
+
+    suffixes = ['B', 'KB', 'MB', 'GB', 'TB']
+    magnitude = float(bytes_value)
+    step = 0
+
+    # Divide by 1000 (not 1024) per unit until the value fits or units run out.
+    while magnitude >= 1000 and step < len(suffixes) - 1:
+        magnitude /= 1000
+        step += 1
+
+    if step == 0:
+        return f"{int(magnitude)}{suffixes[step]}"
+
+    decimals = 0 if magnitude >= 100 else (1 if magnitude >= 10 else 2)
+    return f"{magnitude:.{decimals}f}{suffixes[step]}"
+
+
+def format_time_ago(timestamp):
+    """Describe how long ago timestamp (ISO-8601 string or epoch seconds) was; 'Unknown' if unparseable."""
+    if not timestamp:
+        return "Unknown"
+    import re  # function-local: only needed here to normalise fractional seconds
+    try:
+        if isinstance(timestamp, str):
+            # fromisoformat() before Python 3.11 rejects nanosecond fractions: force 6 digits.
+            text = re.sub(r"\.(\d+)", lambda m: "." + m.group(1)[:6].ljust(6, "0"), timestamp)
+            created = datetime.fromisoformat(text.replace('Z', '+00:00'))
+        else:
+            created = datetime.fromtimestamp(timestamp, tz=timezone.utc)
+
+        now = datetime.now(timezone.utc)
+        diff = now - created
+
+        days = diff.days
+        if days == 0:
+            hours = diff.seconds // 3600
+            if hours == 0:
+                return "Just now"
+            return f"{hours} hour{'s' if hours > 1 else ''} ago"
+        elif days == 1:
+            return "Yesterday"
+        elif days < 7:
+            return f"{days} days ago"
+        elif days < 30:
+            weeks = days // 7
+            return f"{weeks} week{'s' if weeks > 1 else ''} ago"
+        elif days < 365:
+            months = days // 30
+            return f"{months} month{'s' if months > 1 else ''} ago"
+        else:
+            years = days // 365
+            return f"{years} year{'s' if years > 1 else ''} ago"
+    except Exception:
+        return "Unknown"  # unparseable or timezone-less input ends up here
+
+
+def walk_image_layers(image, layers_by_id):
+    """Collect layer IDs from the image's top layer up through each parent.
+
+    The walk ends at a layer without a parent, at an ID missing from
+    layers_by_id, or when an ID repeats (which guards against cycles).
+    """
+    chain = []
+    seen = set()
+    current = image.get("layer")
+
+    while current:
+        if current in seen:
+            break
+        seen.add(current)
+        record = layers_by_id.get(current)
+        if not record:
+            break
+        chain.append(current)
+        current = record.get("parent")
+    return chain
+
+
+def get_image_display_name(image):
+    """Derive a (repository, tag) pair from the first entry of the image's names.
+
+    An image with no names yields ("<none>", "<none>"); a name carrying no tag is
+    reported with the tag "latest".
+    """
+    names = image.get("names", [])
+    if not names:
+        return ("<none>", "<none>")
+
+    reference = names[0]
+    if "@sha256:" in reference:
+        # Keep only what precedes the digest.
+        reference = reference.split("@")[0]
+
+    repo, colon, tag = reference.rpartition(":")
+    if not colon:
+        return (reference, "latest")
+
+    if "/" in tag:
+        # The last ':' sits inside a registry host:port segment, so there is no tag.
+        return (reference, "latest")
+    return (repo, tag)
+
+
+def get_compressed_sizes(images):
+    """Collect compressed layer sizes from the manifest stored beside each image.
+
+    Returns a 4-tuple: size per layer digest, how often each digest is referenced,
+    the set of digests referenced more than once, and a per-image summary list.
+    Images without a readable, non-empty manifest are left out.
+    """
+    size_by_digest = {}
+    use_count = defaultdict(int)
+    summaries = []
+
+    for image in images:
+        image_id = image["id"]
+        manifest_path = os.path.join(IMAGES_DIR, image_id, "manifest")
+        if not os.path.exists(manifest_path):
+            continue
+
+        manifest = load_json_file(manifest_path)
+        if not manifest:
+            continue
+
+        entries = [
+            {"digest": item.get("digest", ""), "size": item.get("size", 0)}
+            for item in manifest.get("layers", [])
+        ]
+        for entry in entries:
+            size_by_digest[entry["digest"]] = entry["size"]
+            use_count[entry["digest"]] += 1
+
+        summaries.append({
+            "id": image_id[:12],
+            "name": get_image_display_name(image)[0],
+            "compressed": sum(entry["size"] for entry in entries),
+            "layers": entries,
+        })
+
+    shared = {digest for digest, uses in use_count.items() if uses > 1}
+    return size_by_digest, use_count, shared, summaries
+
+
+def main(verbose=False):
+    """Print CRI-O image storage usage; verbose=True adds per-image, container and compressed-size detail."""
+
+    # Refuse to run unless the effective user is root
+    if os.geteuid() != 0:
+        print("Error: This script must be run as root (sudo)", file=sys.stderr)
+        sys.exit(1)
+
+    # Load the image, layer and volatile-layer indexes
+    images = load_json_file(IMAGES_JSON)
+    layers = load_json_file(LAYERS_JSON)
+    volatile_layers = load_json_file(VOLATILE_LAYERS_JSON)
+
+    if not images:
+        print("No images found in CRI-O storage")
+        return
+
+    # Create layer lookup map
+    layers_by_id = {layer["id"]: layer for layer in layers}
+
+    # Count layer usage across all images
+    layer_count = defaultdict(int)
+    image_layers = {}
+
+    for image in images:
+        image_id = image["id"]
+        walked_layers = walk_image_layers(image, layers_by_id)
+        image_layers[image_id] = walked_layers
+
+        for layer_id in walked_layers:
+            layer_count[layer_id] += 1
+
+    # Calculate total storage first (each layer counted exactly once)
+    total_size = 0
+    for layer_id in layer_count.keys():
+        layer = layers_by_id[layer_id]  # walked IDs were all looked up in layers_by_id
+        diff_size = layer.get("diff-size")
+        size = diff_size if diff_size is not None else (layer.get("uncompress_size", 0) or 0)
+        total_size += size
+
+    # Map image top layers to image IDs for container counting
+    layer_to_image = {}
+    for img in images:
+        layer_to_image[img.get("layer")] = img["id"]  # a later image with the same top layer overwrites
+
+    # Count containers per image using volatile layers (matching podman's method)
+    containers_per_image = defaultdict(int)
+    for container in volatile_layers:
+        parent = container.get("parent")
+        if parent in layer_to_image:
+            image_id = layer_to_image[parent]
+            containers_per_image[image_id] += 1
+
+    # Calculate sizes for each image
+    image_data = []
+    total_reclaimable = 0
+
+    for image in images:
+        image_id = image["id"]
+        walked_layers = image_layers[image_id]
+
+        shared_size = 0
+        unique_size = 0
+
+        for layer_id in walked_layers:
+            layer = layers_by_id[layer_id]
+            diff_size = layer.get("diff-size")
+            size = diff_size if diff_size is not None else (layer.get("uncompress_size", 0) or 0)
+
+            if layer_count[layer_id] > 1:
+                shared_size += size
+            else:
+                unique_size += size
+
+        image_size = shared_size + unique_size
+
+        # Count containers using this image (from volatile layers)
+        container_count = containers_per_image.get(image_id, 0)
+
+        # Get image metadata
+        repo, tag = get_image_display_name(image)
+        created = format_time_ago(image.get("created"))
+
+        image_data.append({
+            "id": image_id[:12],
+            "repository": repo,
+            "tag": tag,
+            "created": created,
+            "size": image_size,
+            "shared_size": shared_size,
+            "unique_size": unique_size,
+            "containers": container_count,
+            "reclaimable": unique_size if container_count == 0 else 0  # unshared layers of unused images
+        })
+
+        if container_count == 0:
+            total_reclaimable += unique_size
+
+    # Calculate reclaimable percentage
+    reclaimable_pct = (total_reclaimable / total_size * 100) if total_size > 0 else 0
+
+    # Calculate compressed download sizes
+    comp_layers, _, comp_shared, comp_per_image = get_compressed_sizes(images)
+    comp_deduplicated = sum(comp_layers.values())
+
+    if verbose:
+        # Detailed output similar to 'podman system df -v'
+        print("Images space usage:\n")
+        print(f"{'REPOSITORY':<55} {'TAG':<12} {'IMAGE ID':<12} {'CREATED':<10} {'SIZE':<12} {'SHARED SIZE':<12} {'UNIQUE SIZE':<12} {'CONTAINERS'}")
+
+        for img in sorted(image_data, key=lambda x: x["size"], reverse=True):
+            print(f"{img['repository']:<55} {img['tag']:<12} {img['id']:<12} {img['created']:<10} "
+                  f"{format_size(img['size']):<12} {format_size(img['shared_size']):<12} "
+                  f"{format_size(img['unique_size']):<12} {img['containers']}")
+
+        print("\nContainers space usage:\n")
+        print(f"{'CONTAINER ID':<12} {'IMAGE':<35} {'COMMAND':<20} {'LOCAL VOLUMES':<15} {'SIZE':<12} {'CREATED':<12} {'STATUS':<12} {'NAMES'}")
+
+        # Container layers from volatile-layers.json
+        for container in volatile_layers[:10]:  # Limit to first 10
+            container_id = container.get("id", "")[:12]
+            parent = container.get("parent", "")
+            image = layer_to_image.get(parent, "")[:12] if parent in layer_to_image else "N/A"
+            print(f"{container_id:<12} {image:<35} {'N/A':<20} {'0':<15} {'N/A':<12} {'N/A':<12} {'N/A':<12} {'N/A'}")
+
+        print("\nLocal Volumes space usage:\n")
+        print(f"{'VOLUME NAME':<30} {'LINKS':<10} {'SIZE'}")
+        # No volume info in CRI-O context
+
+        # Compressed download sizes; layers whose digest is shared are marked with " *"
+        print("\nCompressed download sizes:\n")
+        for img in sorted(comp_per_image, key=lambda x: x["compressed"], reverse=True):
+            print(f"{img['id']}  {format_size(img['compressed']):>10}  {img['name']}")
+            for layer in img["layers"]:
+                marker = " *" if layer["digest"] in comp_shared else ""
+                print(f"  {layer['digest'][:19]}...  {format_size(layer['size']):>10}{marker}")
+
+    else:
+        # Summary output similar to 'podman system df'
+        print(f"{'TYPE':<15} {'TOTAL':<12} {'ACTIVE':<12} {'SIZE':<15} {'RECLAIMABLE'}")
+        print(f"{'Images':<15} {len(images):<12} {len([i for i in image_data if i['containers'] > 0]):<12} "
+              f"{format_size(total_size):<15} {format_size(total_reclaimable)} ({reclaimable_pct:.0f}%)")
+        print(f"{'Containers':<15} {len(volatile_layers):<12} {'0':<12} {'0B':<15} {'0B (0%)'}")  # fixed placeholders apart from the count
+        print(f"{'Local Volumes':<15} {'0':<12} {'0':<12} {'0B':<15} {'0B (0%)'}")
+
+    # Print summary statistics
+    if verbose:
+        print("\n" + "="*80)
+        print("Storage Summary:")
+        print(f"  Total Images: {len(images)}")
+        print(f"  Images with containers: {len([i for i in image_data if i['containers'] > 0])}")
+        print(f"  Total unique layers: {len(layer_count)}")
+        print(f"  Shared layers (used by >1 image): {len([lid for lid, c in layer_count.items() if c > 1])}")
+        print(f"  Total storage used: {format_size(total_size)}")
+        print(f"  Reclaimable space: {format_size(total_reclaimable)} ({reclaimable_pct:.0f}%)")
+        print(f"  Compressed download size: {format_size(comp_deduplicated)}")
+        print(f"  Compression ratio: {total_size / comp_deduplicated:.1f}:1" if comp_deduplicated > 0 else "")  # NOTE(review): prints an empty line when the size is 0
+
+
+if __name__ == "__main__":
+    # Either "-v" or "--verbose" anywhere in the arguments selects the detailed report.
+    verbose = any(flag in sys.argv for flag in ("-v", "--verbose"))
+    main(verbose)
diff --git a/ansible/roles/common/tasks/files/prom-network-query.py b/ansible/roles/common/tasks/files/prom-network-query.py
new file mode 100644
index 0000000000..e0fe0a7a48
--- /dev/null
+++ b/ansible/roles/common/tasks/files/prom-network-query.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+"""Query Prometheus for network transfer during a measurement window.
+
+Usage:
+  prom-network-query.py [options] <start_epoch> <end_epoch>
+  prom-network-query.py [options] <hours>
+  prom-network-query.py [options]
+
+Options:
+  --prometheus URL        Prometheus base URL (default: http://zen3:9091)
+  --instance HOST:PORT    Prometheus instance label to filter by (e.g. microshift:9100)
+  --device IFACE          Network device (default: enp5s0)
+  --step STEP             Range query step (default: 15s)
+
+Uses increase() to get accurate totals that handle counter resets natively.
+Also shows raw counter time series for visibility into resets.
+"""
+
+import argparse
+import http.client
+import ipaddress
+import json
+import socket
+import sys
+import urllib.parse
+from dataclasses import dataclass
+from datetime import datetime, timezone
+
+DEFAULT_PROMETHEUS = "http://zen3:9091"
+DEFAULT_DEVICE = "enp5s0"
+DEFAULT_STEP = "15s"
+
+
+class PromQueryError(RuntimeError):
+    """Raised when a Prometheus request fails or its response is unusable or ambiguous."""
+
+
+@dataclass(frozen=True)
+class PrometheusEndpoint:
+    """Immutable scheme/host/port/base-path description of a Prometheus server."""
+
+    scheme: str
+    host: str
+    port: int
+    base_path: str = ""
+
+    @property
+    def display_host(self):
+        """Host for use in a URL: one containing ':' and not yet bracketed gets brackets."""
+        needs_brackets = ":" in self.host and not self.host.startswith("[")
+        return f"[{self.host}]" if needs_brackets else self.host
+
+    @property
+    def base_url(self):
+        return "{}://{}:{}{}".format(self.scheme, self.display_host, self.port, self.base_path)
+
+
+def _is_local_or_private_ip(value):
+    addr = ipaddress.ip_address(value)  # raises ValueError when value is not an IP literal
+    return any((addr.is_private, addr.is_loopback, addr.is_link_local))
+
+
+def _validate_prometheus_host(host):
+    """Accept only hosts that are, or resolve solely to, loopback/link-local/private IPs."""
+    try:
+        literal_ok = _is_local_or_private_ip(host)
+    except ValueError:
+        literal_ok = None  # not an IP literal; resolve it as a hostname below
+    if literal_ok:
+        return
+    if literal_ok is not None:
+        raise argparse.ArgumentTypeError(
+            f"--prometheus host must be loopback/link-local/private, got {host!r}"
+        )
+
+    try:
+        infos = socket.getaddrinfo(host, None, type=socket.SOCK_STREAM)
+    except socket.gaierror as exc:
+        raise argparse.ArgumentTypeError(
+            f"--prometheus host {host!r} could not be resolved: {exc}"
+        ) from exc
+    disallowed = set()
+    for resolved in {info[4][0].split("%", 1)[0] for info in infos}:  # IPv6 scope suffix stripped
+        try:
+            acceptable = _is_local_or_private_ip(resolved)
+        except ValueError:
+            acceptable = False
+        if not acceptable:
+            disallowed.add(resolved)
+    if disallowed:
+        raise argparse.ArgumentTypeError(
+            f"--prometheus host {host!r} resolves to non-private address(es): {', '.join(sorted(disallowed))}"
+        )
+
+
+def _validate_prometheus_url(value):
+    """Parse --prometheus into a PrometheusEndpoint, rejecting URL forms that could enable SSRF."""
+    parsed = urllib.parse.urlparse(value)
+    if parsed.scheme not in ("http", "https"):
+        raise argparse.ArgumentTypeError(
+            f"--prometheus must use http or https, got: {parsed.scheme!r}"
+        )
+    if parsed.username or parsed.password:
+        raise argparse.ArgumentTypeError("--prometheus must not include credentials")
+    if parsed.query or parsed.fragment or parsed.params:
+        raise argparse.ArgumentTypeError(
+            "--prometheus must not include query, fragment, or params"
+        )
+    if parsed.hostname is None:
+        raise argparse.ArgumentTypeError("--prometheus must include a hostname")
+
+    try:
+        port = parsed.port  # urlparse raises ValueError here for a malformed port
+    except ValueError as exc:
+        raise argparse.ArgumentTypeError(f"--prometheus has invalid port: {exc}") from exc
+
+    if port is None:
+        port = 443 if parsed.scheme == "https" else 80  # scheme default when no port is given
+
+    base_path = parsed.path.rstrip("/")  # trailing slashes dropped from the base path
+    _validate_prometheus_host(parsed.hostname)
+    return PrometheusEndpoint(parsed.scheme, parsed.hostname, port, base_path)
+
+
+def parse_window_args(argv):  # returns (args, start, end); bad input exits via parser.error()
+    parser = argparse.ArgumentParser(
+        description="Query Prometheus for RX/TX transfer over a measurement window."
+    )
+    parser.add_argument(
+        "window",
+        nargs="*",
+        help="Either <start_epoch> <end_epoch>, or <hours>, or empty for last 4h.",
+    )
+    parser.add_argument(
+        "--prometheus",
+        default=DEFAULT_PROMETHEUS,
+        type=_validate_prometheus_url,  # argparse also runs this on the string default
+        help=f"Prometheus base URL (default: {DEFAULT_PROMETHEUS})",
+    )
+    parser.add_argument(
+        "--device",
+        default=DEFAULT_DEVICE,
+        help=f"Network device label value (default: {DEFAULT_DEVICE})",
+    )
+    parser.add_argument(
+        "--instance",
+        default=None,
+        help="Prometheus instance label to filter by (e.g. microshift:9100)",
+    )
+    parser.add_argument(
+        "--step",
+        default=DEFAULT_STEP,
+        help=f"Range query step for raw series output (default: {DEFAULT_STEP})",
+    )
+    args = parser.parse_args(argv)
+
+    if len(args.window) == 2:  # explicit <start_epoch> <end_epoch>
+        try:
+            start = int(args.window[0])
+            end = int(args.window[1])
+        except ValueError as exc:
+            parser.error(f"start/end must be integers: {exc}")
+    elif len(args.window) == 1:  # <hours> counted back from now
+        try:
+            hours = int(args.window[0])
+        except ValueError as exc:
+            parser.error(f"hours must be an integer: {exc}")
+        now = int(datetime.now(timezone.utc).timestamp())
+        start = now - hours * 3600
+        end = now
+    elif len(args.window) == 0:  # default window: the last 4 hours
+        now = int(datetime.now(timezone.utc).timestamp())
+        start = now - 4 * 3600
+        end = now
+    else:
+        parser.error("Provide either <start_epoch> <end_epoch>, <hours>, or no positional args.")
+
+    if end <= start:  # also trips for a zero or negative <hours> value
+        parser.error("end_epoch must be greater than start_epoch.")
+
+    return args, start, end
+
+
+def prom_query(prometheus, endpoint, params):  # GET /api/v1/<endpoint>; returns decoded JSON or raises PromQueryError
+    qs = urllib.parse.urlencode(params)
+    api_path = f"{prometheus.base_path}/api/v1/{endpoint}" if prometheus.base_path else f"/api/v1/{endpoint}"
+    request_path = f"{api_path}?{qs}"
+    request_url = f"{prometheus.base_url}{request_path}"  # used only in error messages
+    connection_class = http.client.HTTPSConnection if prometheus.scheme == "https" else http.client.HTTPConnection
+    connection = connection_class(prometheus.host, prometheus.port, timeout=30)
+
+    try:
+        connection.request("GET", request_path, headers={"Accept": "application/json"})
+        response = connection.getresponse()
+        body = response.read()  # read fully before the connection is closed in "finally"
+    except (http.client.HTTPException, OSError, TimeoutError) as exc:
+        raise PromQueryError(f"request failed for {request_url}: {exc}") from exc
+    finally:
+        connection.close()
+
+    if response.status >= 400:
+        response_body = body.decode("utf-8", errors="replace").strip()
+        if response_body:  # prefer the server's own error text over the bare reason phrase
+            raise PromQueryError(
+                f"HTTP {response.status} from {request_url}: {response_body}"
+            )
+        raise PromQueryError(
+            f"HTTP {response.status} from {request_url}: {response.reason}"
+        )
+
+    try:
+        data = json.loads(body)
+    except json.JSONDecodeError as exc:
+        raise PromQueryError(f"invalid JSON from {request_url}: {exc}") from exc
+
+    if data.get("status") != "success":  # NOTE(review): assumes the JSON body is an object
+        error_type = data.get("errorType", "unknown_error")
+        error_msg = data.get("error", "no details")
+        raise PromQueryError(
+            f"Prometheus API error ({endpoint}): {error_type}: {error_msg}"
+        )
+
+    return data
+
+
+def query_range(prometheus, query, start, end, step):
+    # Call the query_range endpoint with the given window and step.
+    params = dict(query=query, start=start, end=end, step=step)
+    return prom_query(prometheus, "query_range", params)
+
+
+def query_instant(prometheus, query, time):
+    return prom_query(prometheus, "query", dict(query=query, time=time))
+
+
+def fmt_ts(epoch):
+    return f"{datetime.fromtimestamp(epoch):%Y-%m-%d %H:%M:%S}"  # local time, no zone suffix
+
+
+def fmt_bytes(b):
+    """Format a byte count using binary units; values below 1 KiB print as whole bytes."""
+    for power, unit, digits in ((3, "GiB", 2), (2, "MiB", 1), (1, "KiB", 1)):
+        if b >= 1024 ** power:
+            return f"{b / 1024 ** power:.{digits}f} {unit}"
+    # increase() results are floats; truncate so fractional bytes are not printed
+    # with full float precision (e.g. "512.3333333333333 B").
+    return f"{int(b)} B"
+
+
+def label_selector(device, instance=None):
+    """Build PromQL label matchers; backslashes, quotes and newlines in values are escaped."""
+    table = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n"})
+    pairs = [("device", device)] + ([("instance", instance)] if instance else [])
+    # Escaping keeps a stray quote in --device/--instance from breaking out of the matcher.
+    return ",".join(f'{key}="{str(value).translate(table)}"' for key, value in pairs)
+
+
+def get_single_series(result, context):
+    """Return the sole series in result, None if it is empty; raise if there are several."""
+    if not result:
+        return None
+    count = len(result)
+    if count == 1:
+        return result[0]
+    raise PromQueryError(
+        f"{context}: expected 1 series, got {count}; tighten labels (--instance/--device)."
+    )
+
+
+def get_increase(metric, start, end, prometheus, device, instance=None):
+    """Return increase() of metric over the start..end window as a float, or None if no series matches."""
+    matchers = label_selector(device, instance)
+    # max by (instance, device) folds series that differ only in other labels into one.
+    expr = "max by (instance, device) (increase(%s{%s}[%ss]))" % (metric, matchers, end - start)
+    payload = query_instant(prometheus, expr, end)
+    label = f"{metric} increase()"
+    series = get_single_series(payload["data"]["result"], label)
+    if series is None:
+        return None
+    try:
+        return float(series["value"][1])
+    except (KeyError, IndexError, TypeError, ValueError) as exc:
+        raise PromQueryError(f"{label}: unexpected response shape") from exc
+
+
+def get_time_series(start, end, prometheus, device, step, instance=None):
+    """Fetch the raw receive-counter samples across the window; [] when no series matches."""
+    matchers = label_selector(device, instance)
+    # max by (instance, device) folds series that differ only in other labels into one.
+    expr = "max by (instance, device) (node_network_receive_bytes_total{%s})" % (matchers,)
+    payload = query_range(prometheus, expr, start, end, step)
+    series = get_single_series(payload["data"]["result"], "raw receive counter")
+    if series is None:
+        return []
+    try:
+        return series["values"]
+    except (KeyError, TypeError) as exc:
+        raise PromQueryError("raw receive counter: unexpected response shape") from exc
+
+
+def main():  # returns the process exit status: 0 on success, 1 on query errors
+    args, start, end = parse_window_args(sys.argv[1:])
+
+    duration = end - start
+    print(f"Measurement window: {fmt_ts(start)} -> {fmt_ts(end)} ({duration}s)")
+    print(f"Prometheus: {args.prometheus.base_url}")
+    print(f"Instance: {args.instance or '(all)'}")
+    print(f"Device: {args.device}")
+    print()
+
+    try:
+        # Get increase() — the accurate total from Prometheus
+        rx_increase = get_increase(
+            "node_network_receive_bytes_total", start, end,
+            args.prometheus, args.device, args.instance,
+        )
+        tx_increase = get_increase(
+            "node_network_transmit_bytes_total", start, end,
+            args.prometheus, args.device, args.instance,
+        )
+    except PromQueryError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 1
+
+    print("=== Network transfer (Prometheus increase) ===")
+    if rx_increase is not None:
+        print(f"  RX: {fmt_bytes(rx_increase)} ({int(rx_increase):,} bytes)")
+    else:
+        print("  RX: no data")
+    if tx_increase is not None:
+        print(f"  TX: {fmt_bytes(tx_increase)} ({int(tx_increase):,} bytes)")
+    else:
+        print("  TX: no data")
+    if rx_increase is not None and tx_increase is not None:
+        print(f"  Total: {fmt_bytes(rx_increase + tx_increase)}")
+    print()
+
+    try:
+        # Get raw time series for visibility
+        values = get_time_series(
+            start, end, args.prometheus, args.device, args.step, args.instance
+        )
+    except PromQueryError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 1
+
+    if not values:
+        print("No time series data available.")
+        return 0  # NOTE(review): exits before the JSON summary below is printed
+
+    print(f"=== Raw counter time series ({len(values)} samples, step={args.step}) ===")
+    print(f"{'timestamp':<24} {'rx_bytes':>16} {'rx_MB':>10} {'delta_MB':>10}")
+    print("-" * 64)
+
+    prev_val = None
+    resets = []
+    for ts, val in values:
+        val = int(float(val))  # presumably the sample value arrives as a string — confirm
+        dt = fmt_ts(ts)
+        mb = val / 1024 / 1024
+        delta = ""
+        if prev_val is not None:
+            diff = val - prev_val
+            delta = f"{diff / 1024 / 1024:>10.2f}"
+            if diff < 0:  # a counter that went down is recorded as a reset
+                delta += " *** RESET ***"
+                resets.append((ts, prev_val, val))
+        print(f"{dt:<24} {val:>16} {mb:>10.2f} {delta}")
+        prev_val = val
+
+    if resets:
+        print()
+        print(f"Counter resets: {len(resets)}")
+        for ts, before, after in resets:
+            print(f"  {fmt_ts(ts)}: {before:,} -> {after:,} "
+                  f"(lost {fmt_bytes(before - after)})")
+
+    # Output JSON summary for machine parsing
+    print()
+    summary = {
+        "start_epoch": start,
+        "end_epoch": end,
+        "duration_seconds": duration,
+        "device": args.device,
+        "rx_bytes": int(rx_increase) if rx_increase is not None else 0,  # missing data is reported as 0
+        "tx_bytes": int(tx_increase) if tx_increase is not None else 0,
+        "counter_resets": len(resets),
+        "samples": len(values),
+    }
+    print(json.dumps(summary))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0 on success and 1 on query errors
diff --git a/ansible/roles/microshift-start/defaults/main.yml b/ansible/roles/microshift-start/defaults/main.yml
index 5b5cb193cc..d20e545e16 100644
--- a/ansible/roles/microshift-start/defaults/main.yml
+++ b/ansible/roles/microshift-start/defaults/main.yml
@@ -12,5 +12,3 @@ du_dirs:
   - /usr/bin/*
 
 sample_interval: 5
-
-vnstat_db: /var/lib/vnstat/vnstat.db
diff --git a/ansible/roles/microshift-start/tasks/main.yml b/ansible/roles/microshift-start/tasks/main.yml
index 3962406931..50a9ea4e4a 100644
--- a/ansible/roles/microshift-start/tasks/main.yml
+++ b/ansible/roles/microshift-start/tasks/main.yml
@@ -34,26 +34,6 @@
     state: stopped
     enabled: no
 
-- name: check for vnstat
-  ansible.builtin.command: rpm -q vnstat
-  register: vnstat_check
-  ignore_errors: true
-
-- name: vnstat cleanup
-  become: yes
-  block:
-  - name: stop & enable vnstat service
-    ansible.builtin.systemd:
-      name: vnstat
-      state: stopped
-      enabled: yes
-
-  - name: delete vnstat db
-    ansible.builtin.file:
-      path: "{{ vnstat_db }}"
-      state: absent
-  when: vnstat_check.rc == 0
-
 - name: create .kube home dir
   ansible.builtin.file:
     path: ~/.kube/
@@ -82,7 +62,7 @@
   block:
   - name: source pbench-agent & register-tool-set
     ansible.builtin.shell: source /etc/profile.d/pbench-agent.sh && pbench-register-tool-set
-  
+
   - name: set new pidstat interval
     ansible.builtin.shell: source /etc/profile.d/pbench-agent.sh && pbench-register-tool --name=pidstat -- --interval={{ sample_interval }}
 
@@ -91,7 +71,7 @@
     async: "{{ pbench_record_duration|int * 2 }}"
     poll: 0
     register: pbench_user_benchmark_result
-  
+
   - name: Pause for 60 seconds to gather steady state for pbench tool recording
     ansible.builtin.pause:
       seconds: 60
@@ -104,6 +84,10 @@
   register: cadvisor_check
   ignore_errors: true
 
+- name: record network measurement start epoch  # opens the time window later handed to prom-network-query.py
+  ansible.builtin.command: date +%s
+  register: network_start_epoch  # .stdout = epoch seconds; no delegate_to here, so presumably read from the managed host's clock
+
 - name: measure microshift service boot time
   include_tasks: roles/common/tasks/boot.yml
   vars:
@@ -138,22 +122,68 @@
   include_tasks: roles/common/tasks/disk.yml
   loop: "{{ du_dirs }}"
 
-- name: vnstat collection tasks
+- name: capture container image sizes
   block:
-    - name: wait for vnstat db to populate
-      ansible.builtin.shell: vnstat | grep today
-      retries: 60
-      delay: 10
-      register: vnstat_db
-      until: vnstat_db.rc == 0
-
-    - name: get vnstat network usage
-      ansible.builtin.command: vnstat
-      register: vnstat
-
-    - name: record network usage to file
+    - name: copy crio-df script to remote host  # stage the helper on the managed host so the next task can run it there
+      become: yes
+      ansible.builtin.copy:
+        src: roles/common/tasks/files/crio-df.py
+        dest: /tmp/crio-df.py
+        mode: '0755'
+
+    - name: run crio-df script
+      become: yes  # NOTE(review): presumably elevated rights are needed to inspect CRI-O storage — confirm
+      ansible.builtin.command: python3 /tmp/crio-df.py -v
+      register: crio_df  # stdout is written verbatim to images.txt by the following task
+
+    - name: save container image sizes to results
+      ansible.builtin.copy:
+        content: "{{ crio_df.stdout }}"
+        dest: "{{ results_dir }}/images.txt"
+      delegate_to: localhost  # write the results file on the control node rather than on the managed host
+
+- name: record network measurement end epoch  # closes the window opened by network_start_epoch
+  ansible.builtin.command: date +%s
+  register: network_end_epoch  # .stdout is passed as the end-time argument to prom-network-query.py
+
+- name: resolve prometheus network query settings  # derive endpoint/device/instance facts consumed by the network query task
+  ansible.builtin.set_fact:  # each fact prefers an explicit override variable and falls back to a computed default
+    prom_network_endpoint: >-  # prometheus_query_endpoint if set and non-empty; else http://<host>:<prometheus_port or 9091>, where <host> is the first 'logging' group member's ansible_host (or its inventory name), or 'zen3' when that group is missing/empty
+      {{
+        prometheus_query_endpoint | default(
+          'http://' ~
+          (
+            (
+              hostvars[(groups['logging'] | default([]) | first)].ansible_host
+              | default((groups['logging'] | default([]) | first))
+            )
+            if (groups['logging'] | default([]) | length > 0)
+            else 'zen3'
+          )
+          ~ ':' ~ (prometheus_port | default(9091) | string),
+          true
+        )
+      }}
+    prom_network_device: "{{ network_device | default(ansible_default_ipv4.interface | default('enp5s0'), true) }}"  # network_device if set and non-empty, else the default-IPv4 interface fact, else 'enp5s0'
+    prom_network_instance: "{{ inventory_hostname }}:9100"  # NOTE(review): presumably must equal the exporter's Prometheus instance label for this host — confirm against the scrape config
+  when: prometheus_logging | bool  # facts are only set when prometheus_logging is truthy
+
+- name: query prometheus for network transfer  # runs the query script on the control node and stores its stdout as network.txt
+  block:
+    - name: run prometheus network query
+      ansible.builtin.command: >  # script path is anchored to playbook_dir: command does not use Ansible's file search path, so a bare relative path would depend on the directory ansible-playbook was started from
+        python3 "{{ playbook_dir }}/roles/common/tasks/files/prom-network-query.py"
+        --prometheus {{ prom_network_endpoint }}
+        --instance {{ prom_network_instance }}
+        --device {{ prom_network_device }}
+        {{ network_start_epoch.stdout }}
+        {{ network_end_epoch.stdout }}
+      register: prom_network  # stdout (table plus trailing JSON summary line) is saved by the next task
+      delegate_to: localhost  # the script is executed from the controller's checkout, not copied to the managed host
+
+    - name: save network data to results
       ansible.builtin.copy:
-        content: "{{ vnstat.stdout }}"
+        content: "{{ prom_network.stdout }}"
         dest: "{{ results_dir }}/network.txt"
       delegate_to: localhost
-  when: vnstat_check.rc == 0
+  when: prometheus_logging | bool  # same gate as the set_fact task that defines the prom_network_* facts used above