# Monitoring & Observability Tutorial



## Metrics Categories

### 1. GPU Metrics
- `ml_platform_gpu_utilization_percent`
- `ml_platform_gpu_temperature_celsius`
- `ml_platform_gpu_memory_used_bytes`
- `ml_platform_gpu_memory_total_bytes`

### 2. Job Metrics
- `ml_platform_jobs_submitted_total`
- `ml_platform_jobs_completed_total`
- `ml_platform_jobs_failed_total`
- `ml_platform_job_duration_seconds`

### 3. Queue Metrics
- `ml_platform_queue_depth`
- `ml_platform_active_workers_total`
- `ml_platform_queue_wait_time_seconds`

In [23]:
import sys
# Put the parent directory first on the import path so the `src.*` imports
# below resolve (presumably the notebook lives one level below the repo root
# -- confirm if the notebook is moved).
sys.path.insert(0, '..')

from src.monitoring.metrics import MetricsCollector, get_metrics_collector
from src.monitoring.logger import setup_logging
from src.resources.gpu_manager import get_gpu_manager
from src.resources.health_checker import get_health_checker, HealthThresholds
from src.scheduler.job_queue import get_job_queue
import json
import logging

# Configure logging through the project's helper at INFO level, then create a
# logger named after the current module for use in later cells.
setup_logging(level="INFO")
logger = logging.getLogger(__name__)
print("✓ Setup complete!")


2025-10-09 21:34:03 - root - INFO - Logging initialized at level INFO [logger.py:202]
✓ Setup complete!


## 1. Metrics Collection

In [5]:
# Get metrics collector
metrics = get_metrics_collector()

# Metric names shown to the reader, grouped under the heading printed above
# each group. Entries are display text only; nothing is registered here.
metric_catalog = {
    "📊 GPU Metrics": (
        "ml_platform_gpu_utilization_percent",
        "ml_platform_gpu_temperature_celsius",
        "ml_platform_gpu_memory_used_bytes",
        "ml_platform_gpu_memory_total_bytes",
    ),
    "📈 Job Metrics": (
        "ml_platform_jobs_submitted_total (counter)",
        "ml_platform_jobs_completed_total (counter)",
        "ml_platform_jobs_failed_total (counter)",
        "ml_platform_job_duration_seconds (histogram)",
    ),
    "📋 Queue Metrics": (
        "ml_platform_queue_depth (gauge)",
        "ml_platform_active_workers_total (gauge)",
        "ml_platform_queue_wait_time_seconds (histogram)",
    ),
}

print("Available Metrics:")
for heading, entries in metric_catalog.items():
    # A blank line separates each group from the previous one.
    print(f"\n{heading}:")
    for entry in entries:
        print(f"  - {entry}")

2025-10-09 20:59:50 - src.resources.health_checker - INFO - Health checker initialized with 60s interval [health_checker.py:82]
2025-10-09 20:59:50 - src.scheduler.priority_manager - INFO - Priority manager initialized (fair_share=True, starvation_timeout=3600s) [priority_manager.py:85]
2025-10-09 20:59:50 - src.monitoring.metrics - INFO - Metrics collector initialized [metrics.py:176]
Available Metrics:

📊 GPU Metrics:
  - ml_platform_gpu_utilization_percent
  - ml_platform_gpu_temperature_celsius
  - ml_platform_gpu_memory_used_bytes
  - ml_platform_gpu_memory_total_bytes

📈 Job Metrics:
  - ml_platform_jobs_submitted_total (counter)
  - ml_platform_jobs_completed_total (counter)
  - ml_platform_jobs_failed_total (counter)
  - ml_platform_job_duration_seconds (histogram)

📋 Queue Metrics:
  - ml_platform_queue_depth (gauge)
  - ml_platform_active_workers_total (gauge)
  - ml_platform_queue_wait_time_seconds (histogram)


## 2. GPU Monitoring

In [6]:
# Get GPU manager
gpu_manager = get_gpu_manager()

# Divisor used to display raw byte counts with a "GB" label.
BYTES_PER_GB = 1024 ** 3

if gpu_manager.num_gpus <= 0:
    # Nothing to report or record without any GPUs.
    print("No GPUs available (CPU mode)")
else:
    print(f"Monitoring {gpu_manager.num_gpus} GPUs:\n")

    for gpu_info in gpu_manager.get_all_gpu_info():
        # Report the current readings for this device.
        print(f"GPU {gpu_info.id}:")
        print(f"  Name: {gpu_info.name}")
        print(f"  Utilization: {gpu_info.utilization:.1f}%")
        used_gb = gpu_info.used_memory / BYTES_PER_GB
        total_gb = gpu_info.total_memory / BYTES_PER_GB
        print(f"  Memory: {used_gb:.1f}GB / {total_gb:.1f}GB")
        print(f"  Temperature: {gpu_info.temperature:.1f}°C")

        # Push the same readings into the metrics collector, keyed by GPU id.
        # `metrics` is the collector created in the metrics-collection cell.
        metrics.record_gpu_utilization(gpu_info.id, gpu_info.utilization)
        metrics.record_gpu_temperature(gpu_info.id, gpu_info.temperature)
        metrics.record_gpu_memory(
            gpu_info.id,
            gpu_info.used_memory,
            gpu_info.total_memory,
        )
        print("  ✓ Metrics recorded\n")

No GPUs available (CPU mode)


## 3. Health Checks

In [8]:
# Get health checker
health_checker = get_health_checker()
thresholds = HealthThresholds()

print("Health Check Thresholds:")
print(f"  Max GPU Temperature: {thresholds.max_temperature}°C")
print(f"  Max GPU Memory: {thresholds.max_memory_percent}%")
# NOTE(review): the interval and failure threshold below are literal text, not
# values read from the checker -- confirm they match the real configuration.
print("  Check Interval: 60 seconds")
print("  Consecutive Failure Threshold: 3\n")

# Get health summary
summary = health_checker.get_health_summary()

print("Health Summary:")
# One line per state, in a fixed order; the label is the capitalised key.
for state in ("healthy", "degraded", "unhealthy", "unknown"):
    print(f"  {state.capitalize()}: {summary[state]}")

Health Check Thresholds:
  Max GPU Temperature: 85.0°C
  Max GPU Memory: 95.0%
  Check Interval: 60 seconds
  Consecutive Failure Threshold: 3

Health Summary:
  Healthy: 0
  Degraded: 0
  Unhealthy: 0
  Unknown: 0


## 4. Job Metrics

In [18]:
# Simulate job lifecycle metrics for a single demo job: submission, queue
# wait, and completion are each recorded on the `metrics` collector created
# in the metrics-collection cell.

# Labels shared by every record below, defined once so the submit / wait /
# complete calls cannot drift apart when the demo values are edited.
demo_user_id = "demo-user"
demo_priority = "MEDIUM"

# Job submitted
metrics.record_job_submitted(
    user_id=demo_user_id,
    priority=demo_priority,
)
print("✓ Job submission recorded")

# Job started (queue wait time)
queue_wait_seconds = 45
metrics.record_job_wait_time(priority=demo_priority, wait_time=queue_wait_seconds)
print(f"✓ Queue wait time recorded: {queue_wait_seconds}s")

# Job completed
job_duration_seconds = 1800  # 30 minutes
metrics.record_job_completed(
    user_id=demo_user_id,
    status="success",
    duration=job_duration_seconds,
    job_type="training",
    num_gpus=2,
)
# The minute figure is derived from the duration instead of being typed into
# the message, so the text stays correct if the duration above is changed.
job_duration_minutes = job_duration_seconds // 60
print(
    f"✓ Job completion recorded: {job_duration_seconds}s "
    f"({job_duration_minutes} min)\n"
)

# Get job queue stats (fetched but not displayed in this cell)
job_queue = get_job_queue()
stats = job_queue.get_queue_stats()


✓ Job submission recorded
✓ Queue wait time recorded: 45s
✓ Job completion recorded: 1800s (30 min)



## 5. Structured Logging

In [20]:
# Get logger
import logging

logger = logging.getLogger(__name__)

# All three records below describe the same demo job.
demo_job_id = "demo-job-001"

# Log with structured context: each dict is handed to `extra`, which attaches
# its keys as attributes on the emitted log record.
job_started_context = {
    "job_id": demo_job_id,
    "user_id": "demo-user",
    "num_gpus": 2,
    "priority": "MEDIUM",
}
logger.info("Job started", extra=job_started_context)

job_progress_context = {
    "job_id": demo_job_id,
    "progress": 50.0,
    "current_loss": 0.342,
    "epoch": 1,
}
logger.info("Job progress", extra=job_progress_context)

job_completed_context = {
    "job_id": demo_job_id,
    "duration_seconds": 1800,
    "final_loss": 0.125,
    "status": "success",
}
logger.info("Job completed", extra=job_completed_context)

print("\n✓ Structured logs written (check logs directory)")
print("  Format: JSON with timestamp, level, message, context")

2025-10-09 21:07:22 - __main__ - INFO - Job started [602018948.py:6]
2025-10-09 21:07:22 - __main__ - INFO - Job progress [602018948.py:16]
2025-10-09 21:07:22 - __main__ - INFO - Job completed [602018948.py:26]

✓ Structured logs written (check logs directory)
  Format: JSON with timestamp, level, message, context


## 6. Alert Rules (Kubernetes)

In production, these alerts are defined in `k8s/servicemonitor.yaml`.

In [21]:
print("Production Alert Rules:\n")

# Key order used for every alert dict; each row below lists its values in
# this same order.
_ALERT_FIELDS = ("name", "condition", "for", "severity", "action")

_ALERT_ROWS = (
    (
        "GPUTemperatureHigh",
        "gpu_temperature_celsius > 80",
        "5m",
        "warning",
        "Slack notification + page oncall",
    ),
    (
        "GPUMemoryExhausted",
        "(gpu_memory_used / gpu_memory_total) > 0.95",
        "5m",
        "warning",
        "Slack notification",
    ),
    (
        "QueueDepthHigh",
        "queue_depth > 100",
        "10m",
        "warning",
        "Auto-scale workers",
    ),
    (
        "WorkerPodDown",
        "up{job='ml-worker'} == 0",
        "5m",
        "critical",
        "Page oncall immediately",
    ),
    (
        "HighJobFailureRate",
        "(job_failures / job_submissions) > 0.5",
        "10m",
        "warning",
        "Investigate worker health",
    ),
)

# One dict per alert rule, keyed by the field names above.
alerts = [dict(zip(_ALERT_FIELDS, row)) for row in _ALERT_ROWS]

for alert in alerts:
    # A single call per rule; the trailing "\n" leaves a blank line between
    # consecutive rules.
    print(
        f"📢 {alert['name']}\n"
        f"   Condition: {alert['condition']}\n"
        f"   For: {alert['for']}\n"
        f"   Severity: {alert['severity']}\n"
        f"   Action: {alert['action']}\n"
    )

Production Alert Rules:

📢 GPUTemperatureHigh
   Condition: gpu_temperature_celsius > 80
   For: 5m
   Severity: warning
   Action: Slack notification + page oncall

📢 GPUMemoryExhausted
   Condition: (gpu_memory_used / gpu_memory_total) > 0.95
   For: 5m
   Severity: warning
   Action: Slack notification

📢 QueueDepthHigh
   Condition: queue_depth > 100
   For: 10m
   Severity: warning
   Action: Auto-scale workers

📢 WorkerPodDown
   Condition: up{job='ml-worker'} == 0
   For: 5m
   Severity: critical
   Action: Page oncall immediately

📢 HighJobFailureRate
   Condition: (job_failures / job_submissions) > 0.5
   For: 10m
   Severity: warning
   Action: Investigate worker health

