-
Notifications
You must be signed in to change notification settings - Fork 224
USHIFT-6637: Ansible: Improve performance metrics capture #6271
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
openshift-merge-bot
merged 4 commits into
openshift:main
from
sjug:ansible_improvements
Feb 26, 2026
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
1d6ee8a
ready.sh: Add timestamps and not ready pod tracking
sjug d946858
install-microshift: Clean baseline before initial disk measurement
sjug 000caeb
install-logging-exporters: Fix node exporter package and service names
sjug 47823b5
microshift-start: Add Prometheus network and image size metrics
sjug File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,347 @@ | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| CRI-O Storage Disk Usage Reporter | ||
| A replacement for 'podman system df -v' that correctly calculates SharedSize for CRI-O storage. | ||
| Also calculates compressed (download) sizes from locally stored registry manifests. | ||
| """ | ||
|
|
||
| import json | ||
| import os | ||
| import sys | ||
| from collections import defaultdict | ||
| from datetime import datetime, timezone | ||
|
|
||
# Locations of the containers/storage metadata files read by this script.
# Every path is derived from STORAGE_ROOT.
STORAGE_ROOT = "/var/lib/containers/storage"
IMAGES_DIR = STORAGE_ROOT + "/overlay-images"
IMAGES_JSON = IMAGES_DIR + "/images.json"
LAYERS_JSON = STORAGE_ROOT + "/overlay-layers/layers.json"
VOLATILE_LAYERS_JSON = STORAGE_ROOT + "/overlay-layers/volatile-layers.json"
|
|
||
|
|
||
def load_json_file(filepath):
    """Load and parse a JSON file, returning an empty list on any failure.

    Args:
        filepath: Path of the JSON document to read.

    Returns:
        The decoded JSON value. An empty list is returned when the file is
        missing or is not valid JSON (silently), or when any other error
        occurs while reading it (after a warning on stderr).
    """
    try:
        # JSON interchange text is UTF-8; decode it explicitly instead of
        # relying on the locale-dependent default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
    except Exception as e:
        # Best-effort reader: report the problem but keep the caller running.
        print(f"Warning: Could not load {filepath}: {e}", file=sys.stderr)
        return []
|
|
||
|
|
||
def format_size(bytes_value):
    """Format a byte count as a short human-readable string in decimal (SI) units.

    Values are scaled by 1000 (not 1024) and printed with roughly three
    significant digits: two decimals below 10, one decimal below 100 and
    none from 100 upwards. Plain byte counts are printed as integers.

    Args:
        bytes_value: Size in bytes.

    Returns:
        A string such as "0B", "999B", "1.50KB", "12.3MB" or "123GB".
    """
    if bytes_value == 0:
        return "0B"

    units = ['B', 'KB', 'MB', 'GB', 'TB']
    last_unit = len(units) - 1
    unit_index = 0
    size = float(bytes_value)

    # Use 1000 (decimal) instead of 1024 (binary) steps.
    while size >= 1000 and unit_index < last_unit:
        size /= 1000
        unit_index += 1

    if unit_index == 0:  # Bytes are never shown with a fraction
        return f"{int(size)}{units[unit_index]}"

    # Rounding for display can carry the value up to 1000 (e.g. 999.999KB);
    # move to the next unit so the result reads "1.00MB", not "1000KB".
    if round(size) >= 1000 and unit_index < last_unit:
        size /= 1000
        unit_index += 1

    # Pick the precision from the *rounded* value so a carry across a tier
    # boundary (99.96 -> 100, 9.996 -> 10.0) does not add an extra digit.
    if round(size, 1) >= 100:
        return f"{size:.0f}{units[unit_index]}"
    if round(size, 2) >= 10:
        return f"{size:.1f}{units[unit_index]}"
    return f"{size:.2f}{units[unit_index]}"
|
|
||
|
|
||
def format_time_ago(timestamp):
    """Format a creation time as a coarse relative age ("3 days ago").

    Args:
        timestamp: Either an ISO 8601 / RFC 3339 string (a trailing "Z" and
            any number of fractional-second digits are accepted) or a
            numeric POSIX timestamp. Strings without a UTC offset are
            interpreted as UTC.

    Returns:
        "Just now", "N hour(s) ago", "Yesterday", "N days ago",
        "N week(s) ago", "N month(s) ago" or "N year(s) ago". "Unknown" is
        returned for a falsy or unparseable value. A time in the future
        (e.g. clock skew) is reported as "Just now".
    """
    import re  # local import keeps this function self-contained

    if not timestamp:
        return "Unknown"

    try:
        if isinstance(timestamp, str):
            text = timestamp.strip()
            # fromisoformat() on older Pythons does not understand "Z".
            if text[-1:] in ("Z", "z"):
                text = text[:-1] + "+00:00"
            # Python versions before 3.11 only parse the fraction widths
            # that isoformat() itself emits, so nanosecond or trimmed
            # fractions are normalised to exactly six digits first.
            text = re.sub(
                r"\.(\d+)",
                lambda match: "." + match.group(1)[:6].ljust(6, "0"),
                text,
                count=1,
            )
            created = datetime.fromisoformat(text)
        else:
            created = datetime.fromtimestamp(timestamp, tz=timezone.utc)

        # A naive datetime cannot be subtracted from an aware one.
        if created.tzinfo is None:
            created = created.replace(tzinfo=timezone.utc)

        diff = datetime.now(timezone.utc) - created
    except (ValueError, TypeError, OverflowError, OSError):
        return "Unknown"

    # A negative difference would otherwise render as "-1 days ago".
    if diff.total_seconds() < 0:
        return "Just now"

    days = diff.days
    if days == 0:
        hours = diff.seconds // 3600
        if hours == 0:
            return "Just now"
        return f"{hours} hour{'s' if hours > 1 else ''} ago"
    if days == 1:
        return "Yesterday"
    if days < 7:
        return f"{days} days ago"
    if days < 30:
        weeks = days // 7
        return f"{weeks} week{'s' if weeks > 1 else ''} ago"
    if days < 365:
        months = days // 30
        return f"{months} month{'s' if months > 1 else ''} ago"
    years = days // 365
    return f"{years} year{'s' if years > 1 else ''} ago"
|
|
||
|
|
||
def walk_image_layers(image, layers_by_id):
    """Collect the IDs of an image's layers, from its top layer down to the base.

    The walk follows each layer's "parent" reference and stops at the first
    missing ID, unknown layer, or layer that was already visited (which
    guards against a cyclic parent chain).
    """
    chain = []
    seen = set()
    current = image.get("layer")

    while current:
        if current in seen:
            break
        seen.add(current)

        record = layers_by_id.get(current)
        if not record:
            # The chain references a layer that is not in the lookup table.
            break

        chain.append(current)
        current = record.get("parent")

    return chain
|
|
||
|
|
||
def get_image_display_name(image):
    """Return a (repository, tag) pair describing an image for display.

    Only the first entry of the image's "names" list is considered. An
    image without names yields ("<none>", "<none>"); a name without an
    explicit tag is reported with the tag "latest".
    """
    names = image.get("names", [])
    if not names:
        return ("<none>", "<none>")

    reference = names[0]

    # Drop a trailing "@sha256:..." digest, keeping what precedes the "@".
    if "@sha256:" in reference:
        reference = reference.split("@")[0]

    repository, colon, tag = reference.rpartition(":")

    # No colon at all, or the last colon is followed by a path separator and
    # therefore belongs to a registry host:port segment: the name is untagged.
    if not colon or "/" in tag:
        return (reference, "latest")
    return (repository, tag)
sjug marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
def get_compressed_sizes(images):
    """Gather layer sizes from the manifest stored alongside each image.

    For every image, the file "<IMAGES_DIR>/<image id>/manifest" is read
    and the "digest" and "size" of each entry in its "layers" list are
    recorded. Images with a missing, unreadable or empty manifest are skipped.

    Returns:
        A 4-tuple of:
          * dict mapping layer digest -> size,
          * dict mapping layer digest -> number of manifests listing it,
          * set of digests listed by more than one manifest,
          * list of per-image summaries with the keys "id" (first 12
            characters), "name", "compressed" (sum of layer sizes) and
            "layers".
    """
    size_by_digest = {}
    reference_count = defaultdict(int)
    summaries = []

    for image in images:
        image_id = image["id"]
        manifest_path = os.path.join(IMAGES_DIR, image_id, "manifest")

        manifest = load_json_file(manifest_path) if os.path.exists(manifest_path) else None
        if not manifest:
            continue

        compressed_total = 0
        layer_entries = []
        for entry in manifest.get("layers", []):
            digest = entry.get("digest", "")
            size = entry.get("size", 0)
            compressed_total += size
            layer_entries.append({"digest": digest, "size": size})
            size_by_digest[digest] = size
            reference_count[digest] += 1

        repo, _ = get_image_display_name(image)
        summaries.append({
            "id": image_id[:12],
            "name": repo,
            "compressed": compressed_total,
            "layers": layer_entries,
        })

    shared = {digest for digest, uses in reference_count.items() if uses > 1}
    return size_by_digest, reference_count, shared, summaries
|
|
||
|
|
||
def _layer_disk_size(layer):
    """Return a layer's "diff-size", falling back to "uncompress_size", else 0."""
    diff_size = layer.get("diff-size")
    if diff_size is not None:
        return diff_size
    return layer.get("uncompress_size", 0) or 0


def main(verbose=False):
    """Print a disk-usage report for the images in CRI-O storage.

    Args:
        verbose: When true, print per-image, per-container and per-manifest
            detail followed by a storage summary. Otherwise print a
            four-line summary table.

    The process exits with status 1 when not run as root. When no images
    are found, a short notice is printed and the function returns.
    """
    # The storage metadata files are read directly, so root is required.
    if os.geteuid() != 0:
        print("Error: This script must be run as root (sudo)", file=sys.stderr)
        sys.exit(1)

    images = load_json_file(IMAGES_JSON)
    layers = load_json_file(LAYERS_JSON)
    volatile_layers = load_json_file(VOLATILE_LAYERS_JSON)

    if not images:
        print("No images found in CRI-O storage")
        return

    layers_by_id = {layer["id"]: layer for layer in layers}

    # Resolve every image's layer chain and count how many images use each layer.
    layer_count = defaultdict(int)
    image_layers = {}
    for image in images:
        chain = walk_image_layers(image, layers_by_id)
        image_layers[image["id"]] = chain
        for layer_id in chain:
            layer_count[layer_id] += 1

    # Total storage: every referenced layer is counted exactly once.
    total_size = sum(_layer_disk_size(layers_by_id[layer_id]) for layer_id in layer_count)

    # Map each image's top layer to the image ID for container counting.
    # NOTE(review): images sharing a top layer overwrite each other here, so
    # their containers are attributed to whichever image is listed last.
    layer_to_image = {img.get("layer"): img["id"] for img in images}

    # A volatile layer whose parent is an image's top layer counts as one
    # container of that image.
    containers_per_image = defaultdict(int)
    for container in volatile_layers:
        parent = container.get("parent")
        if parent in layer_to_image:
            containers_per_image[layer_to_image[parent]] += 1

    # Per-image sizes: layers used by more than one image are "shared".
    image_data = []
    total_reclaimable = 0
    for image in images:
        image_id = image["id"]

        shared_size = 0
        unique_size = 0
        for layer_id in image_layers[image_id]:
            size = _layer_disk_size(layers_by_id[layer_id])
            if layer_count[layer_id] > 1:
                shared_size += size
            else:
                unique_size += size

        container_count = containers_per_image.get(image_id, 0)
        # Only the unique layers of an image without containers are reclaimable.
        reclaimable = unique_size if container_count == 0 else 0

        repo, tag = get_image_display_name(image)
        image_data.append({
            "id": image_id[:12],
            "repository": repo,
            "tag": tag,
            "created": format_time_ago(image.get("created")),
            "size": shared_size + unique_size,
            "shared_size": shared_size,
            "unique_size": unique_size,
            "containers": container_count,
            "reclaimable": reclaimable
        })
        total_reclaimable += reclaimable

    reclaimable_pct = (total_reclaimable / total_size * 100) if total_size > 0 else 0
    active_images = sum(1 for entry in image_data if entry["containers"] > 0)

    if not verbose:
        # Summary output similar to 'podman system df'
        print(f"{'TYPE':<15} {'TOTAL':<12} {'ACTIVE':<12} {'SIZE':<15} {'RECLAIMABLE'}")
        print(f"{'Images':<15} {len(images):<12} {active_images:<12} "
              f"{format_size(total_size):<15} {format_size(total_reclaimable)} ({reclaimable_pct:.0f}%)")
        print(f"{'Containers':<15} {len(volatile_layers):<12} {'0':<12} {'0B':<15} {'0B (0%)'}")
        print(f"{'Local Volumes':<15} {'0':<12} {'0':<12} {'0B':<15} {'0B (0%)'}")
        return

    # Manifests are only needed for the verbose report, so they are not
    # read in summary mode.
    comp_layers, _, comp_shared, comp_per_image = get_compressed_sizes(images)
    comp_deduplicated = sum(comp_layers.values())

    # Detailed output similar to 'podman system df -v'
    print("Images space usage:\n")
    print(f"{'REPOSITORY':<55} {'TAG':<12} {'IMAGE ID':<12} {'CREATED':<10} {'SIZE':<12} {'SHARED SIZE':<12} {'UNIQUE SIZE':<12} {'CONTAINERS'}")

    for img in sorted(image_data, key=lambda x: x["size"], reverse=True):
        print(f"{img['repository']:<55} {img['tag']:<12} {img['id']:<12} {img['created']:<10} "
              f"{format_size(img['size']):<12} {format_size(img['shared_size']):<12} "
              f"{format_size(img['unique_size']):<12} {img['containers']}")

    print("\nContainers space usage:\n")
    print(f"{'CONTAINER ID':<12} {'IMAGE':<35} {'COMMAND':<20} {'LOCAL VOLUMES':<15} {'SIZE':<12} {'CREATED':<12} {'STATUS':<12} {'NAMES'}")

    # Container layers from volatile-layers.json
    for container in volatile_layers[:10]:  # Limit to first 10
        container_id = container.get("id", "")[:12]
        parent = container.get("parent", "")
        image_ref = layer_to_image[parent][:12] if parent in layer_to_image else "N/A"
        print(f"{container_id:<12} {image_ref:<35} {'N/A':<20} {'0':<15} {'N/A':<12} {'N/A':<12} {'N/A':<12} {'N/A'}")

    print("\nLocal Volumes space usage:\n")
    print(f"{'VOLUME NAME':<30} {'LINKS':<10} {'SIZE'}")
    # No volume info in CRI-O context

    # Compressed download sizes; a trailing " *" marks a layer that is
    # listed by more than one image manifest.
    print("\nCompressed download sizes:\n")
    for img in sorted(comp_per_image, key=lambda x: x["compressed"], reverse=True):
        print(f"{img['id']} {format_size(img['compressed']):>10} {img['name']}")
        for layer in img["layers"]:
            marker = " *" if layer["digest"] in comp_shared else ""
            print(f" {layer['digest'][:19]}... {format_size(layer['size']):>10}{marker}")

    # Summary statistics
    shared_layer_total = sum(1 for uses in layer_count.values() if uses > 1)
    print("\n" + "="*80)
    print("Storage Summary:")
    print(f" Total Images: {len(images)}")
    print(f" Images with containers: {active_images}")
    print(f" Total unique layers: {len(layer_count)}")
    print(f" Shared layers (used by >1 image): {shared_layer_total}")
    print(f" Total storage used: {format_size(total_size)}")
    print(f" Reclaimable space: {format_size(total_reclaimable)} ({reclaimable_pct:.0f}%)")
    print(f" Compressed download size: {format_size(comp_deduplicated)}")
    # Without manifest data there is no ratio to report, so the line is
    # omitted rather than printed empty.
    if comp_deduplicated > 0:
        print(f" Compression ratio: {total_size / comp_deduplicated:.1f}:1")
|
|
||
|
|
||
if __name__ == "__main__":
    # Verbose mode is requested with either the short or the long flag.
    verbose = any(flag in sys.argv for flag in ("-v", "--verbose"))
    main(verbose)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.