# 03 – Validate SUHS‑MRV UHS Dataset

This notebook runs a focused set of validation checks against the **SUHS‑MRV v2.0** dataset.

The checks mirror the recommendations in `docs/validation_checks.md` and are intended as a
quick sanity pass for:

- file presence and basic row counts  
- facility ID consistency across files  
- required columns and null checks  
- MRV mass‑balance residual behavior  
- purity / pressure / temperature ranges  


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Locations used by the rest of the notebook.
# The current working directory is treated as the notebook folder, its parent
# as the repository root, and the generated CSVs are read from data/generated
# underneath that root.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR.parent
DATA_DIR = REPO_ROOT / "data" / "generated"

# Echo the resolved paths so a wrong working directory is obvious immediately.
for label, location in (
    ("Notebook dir:", NOTEBOOK_DIR),
    ("Repo root   :", REPO_ROOT),
    ("Data dir    :", DATA_DIR),
):
    print(label, location)


In [None]:
# Read the three core CSV tables from DATA_DIR.
# Timestamp-like columns are parsed to datetimes at load time.
facility_csv = DATA_DIR / "facility_metadata.csv"
timeseries_csv = DATA_DIR / "facility_timeseries.csv"
cycle_summary_csv = DATA_DIR / "cycle_summary.csv"

facility_df = pd.read_csv(facility_csv)
timeseries_df = pd.read_csv(timeseries_csv, parse_dates=["timestamp"])
cycle_summary_df = pd.read_csv(
    cycle_summary_csv,
    parse_dates=["cycle_start", "cycle_end"],
)

# Report (rows, columns) for each table as a first sanity signal.
for label, frame in (
    ("facility_metadata shape  :", facility_df),
    ("facility_timeseries shape:", timeseries_df),
    ("cycle_summary shape      :", cycle_summary_df),
):
    print(label, frame.shape)

## 1. Presence, row counts, and facility ID consistency

In [None]:
# Distinct, non-null facility IDs found in each file.
metadata_ids = set(facility_df["facility_id"].dropna().unique())
timeseries_ids = set(timeseries_df["facility_id"].dropna().unique())
cycle_ids = set(cycle_summary_df["facility_id"].dropna().unique())

print("Unique facilities in metadata :", len(metadata_ids))
print("Unique facilities in timeseries:", len(timeseries_ids))
print("Unique facilities in cycles    :", len(cycle_ids))

# Comparing only the counts would pass when two files contain the same number
# of *different* IDs, so compare the ID sets themselves. The symmetric
# difference in the message lists IDs present in one file but not the other
# (truncated to keep the failure output readable).
assert metadata_ids == timeseries_ids, (
    "facility_id mismatch between facility_metadata and facility_timeseries; "
    f"IDs present in only one file: {sorted(metadata_ids ^ timeseries_ids, key=str)[:10]}"
)
assert metadata_ids == cycle_ids, (
    "facility_id mismatch between facility_metadata and cycle_summary; "
    f"IDs present in only one file: {sorted(metadata_ids ^ cycle_ids, key=str)[:10]}"
)

print("OK: facility_id uniqueness is consistent across all three files.")

## 2. Required columns and null checks

In [None]:
# Columns that facility_metadata.csv must provide for this validation pass.
required_facility_cols = [
    "facility_id",
    "facility_type",
    "depth_m",
]

missing_cols = [c for c in required_facility_cols if c not in facility_df.columns]
if missing_cols:
    print("Missing required columns in facility_metadata.csv:", missing_cols)
else:
    print("All required facility columns present.")

# Selecting a column that does not exist raises KeyError, which would hide the
# null report for the columns that *are* there. Count nulls only on the
# required columns actually present.
present_required_cols = [c for c in required_facility_cols if c not in missing_cols]
nulls = facility_df[present_required_cols].isnull().sum()
print("\nNull counts for key facility columns:")
print(nulls)

# A missing required column is still a validation failure; fail explicitly
# (after the report above) instead of via an opaque KeyError.
assert not missing_cols, (
    f"facility_metadata.csv is missing required columns: {missing_cols}"
)

## 3. Mass‑balance residual behavior

In [None]:
# Summarise the mass-balance residual column, when the timeseries has one,
# and fail if its largest magnitude reaches 1e-2.
if "mass_balance_residual" not in timeseries_df.columns:
    print("Column 'mass_balance_residual' not found in timeseries_df.")
else:
    residual = timeseries_df["mass_balance_residual"].astype(float)
    print(residual.describe())

    # Magnitudes are reused for the maximum and for both threshold counts.
    residual_magnitude = residual.abs()

    max_abs = residual_magnitude.max()
    print("\nMax |mass_balance_residual|:", max_abs)

    # Count rows whose residual magnitude exceeds each reporting threshold.
    high_1e3 = residual_magnitude.gt(1e-3).sum()
    high_5e3 = residual_magnitude.gt(5e-3).sum()
    print("Rows with |residual| > 1e-3:", high_1e3)
    print("Rows with |residual| > 5e-3:", high_5e3)

    assert max_abs < 1e-2, "Residuals look too large for a well‑behaved synthetic dataset."

## 4. Simple mass‑balance view for one facility

In [None]:
# Use the first facility listed in the metadata as the sample and work on an
# independent copy of its timeseries rows.
sample_facility = facility_df["facility_id"].iloc[0]
is_sample_row = timeseries_df["facility_id"] == sample_facility
sample_ts = timeseries_df.loc[is_sample_row].copy()

# Mass-balance columns this view looks for.
cols = [
    "working_gas_kg",
    "h2_injected_kg",
    "h2_withdrawn_kg",
    "static_losses_kg",
    "dynamic_losses_kg",
]

# Keep only the columns that actually exist, preserving the order above.
present = []
for name in cols:
    if name in sample_ts.columns:
        present.append(name)
print("Columns available for mass‑balance view:", present)

# Total losses can only be derived when both loss components are available.
if {"static_losses_kg", "dynamic_losses_kg"}.issubset(present):
    static_losses = sample_ts["static_losses_kg"]
    dynamic_losses = sample_ts["dynamic_losses_kg"]
    sample_ts["total_losses_kg"] = static_losses + dynamic_losses

# Show the first rows of whichever view columns exist (including the derived
# total, if it was created above).
view_candidates = [
    "working_gas_kg",
    "h2_injected_kg",
    "h2_withdrawn_kg",
    "total_losses_kg",
]
display_cols = [c for c in view_candidates if c in sample_ts.columns]
sample_ts[display_cols].head()

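## 5. Purity range checks

Only the three H2 purity columns are inspected here; pressure and temperature ranges are not checked in this cell.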

In [None]:
# Purity columns to inspect; any that are absent from the timeseries are
# reported and skipped.
purity_columns = (
    "h2_inlet_purity_pct",
    "h2_outlet_purity_pct",
    "h2_working_purity_pct",
)

for col in purity_columns:
    if col in timeseries_df.columns:
        purity = timeseries_df[col].astype(float)
        print(f"\n{col}:")
        print(purity.describe())

        # Count values outside the 90–100 band on each side separately.
        below_90 = purity.lt(90).sum()
        above_100 = purity.gt(100).sum()

        print("  < 90% :", below_90)
        print("  > 100%:", above_100)
    else:
        print(f"{col}: not present – skipping")

## 6. Summary

In [None]:
# Closing message: reaching this cell means none of the assertions above failed.
closing_lines = (
    "Validation notebook completed. If no assertions failed above,",
    "the SUHS‑MRV v2.0 dataset passed the core sanity checks in this notebook.",
)
for line in closing_lines:
    print(line)