# 03 – Validate SUHS-MRV UHS Dataset

This notebook runs a small set of sanity checks against the generated dataset.

The checks mirror the recommendations in `docs/validation_checks.md` and
`docs/evaluation_metrics.md`, and the notebook can be used to regenerate
`data/examples/example_validation_checks_output.txt`.


In [ ]:
from pathlib import Path

import pandas as pd


In [ ]:
# Resolve the directories used throughout the notebook, starting from the
# current working directory.
# NOTE(review): assumes the kernel is started from a folder directly beneath
# the repository root (e.g. `notebooks/`) — confirm.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR.parent

# Inputs produced by the generator and the folder for example outputs.
DATA_DIR = REPO_ROOT.joinpath("data", "generated")
EXAMPLES_DIR = REPO_ROOT.joinpath("data", "examples")

# Echo the resolved locations so a wrong working directory is spotted early.
print(f"Repo root   : {REPO_ROOT}")
print(f"Data dir    : {DATA_DIR}")
print(f"Examples dir: {EXAMPLES_DIR}")


In [ ]:
# Load the three generated CSV files. Timestamp-like columns are parsed into
# datetimes at read time so later cells can work with them directly.
facility_df = pd.read_csv(DATA_DIR.joinpath("facility_metadata.csv"))

timeseries_path = DATA_DIR.joinpath("facility_timeseries.csv")
timeseries_df = pd.read_csv(timeseries_path, parse_dates=["timestamp"])

cycle_summary_path = DATA_DIR.joinpath("cycle_summary.csv")
cycle_summary_df = pd.read_csv(
    cycle_summary_path,
    parse_dates=["cycle_start", "cycle_end"],
)

# Last expression of the cell: row counts (metadata, timeseries, cycles).
tuple(len(frame) for frame in (facility_df, timeseries_df, cycle_summary_df))


## 1. Presence and basic row-count consistency

We verify that:
- All three files are non-empty.
- `facility_id` coverage is consistent across metadata, timeseries and cycle summary.


In [ ]:
# Section 1 check: all three inputs must contain rows and must describe the
# same set of facilities. Comparing the ID *sets* (not just their sizes)
# catches the case where two files hold equally many but different IDs.
metadata_ids = set(facility_df["facility_id"].dropna())
timeseries_ids = set(timeseries_df["facility_id"].dropna())
cycle_ids = set(cycle_summary_df["facility_id"].dropna())

print("Unique facilities in metadata :", len(metadata_ids))
print("Unique facilities in timeseries:", len(timeseries_ids))
print("Unique facilities in cycles    :", len(cycle_ids))

# Non-empty check. AssertionError is raised explicitly (rather than via a
# bare `assert`) so the check still runs when Python is started with -O.
empty_files = [
    file_name
    for file_name, frame in (
        ("facility_metadata.csv", facility_df),
        ("facility_timeseries.csv", timeseries_df),
        ("cycle_summary.csv", cycle_summary_df),
    )
    if frame.empty
]
if empty_files:
    raise AssertionError(f"Input file(s) contain no rows: {empty_files}")

# Coverage check: report exactly which IDs differ from the metadata file.
for source_name, source_ids in (
    ("timeseries", timeseries_ids),
    ("cycle summary", cycle_ids),
):
    missing_ids = metadata_ids - source_ids
    unexpected_ids = source_ids - metadata_ids
    if missing_ids or unexpected_ids:
        raise AssertionError(
            f"facility_id mismatch between metadata and {source_name}: "
            f"missing={sorted(missing_ids, key=str)}, "
            f"unexpected={sorted(unexpected_ids, key=str)}"
        )

print("OK: facility_id uniqueness is consistent across all three files.")


## 2. Required columns and null checks

We check that key columns exist and that they do not contain unexpected null values.


In [ ]:
# Section 2 check (facility metadata): report missing required columns, then
# count nulls in the required columns that are actually present.
required_facility_cols = ["facility_id", "facility_type", "depth_m"]
missing_facility_cols = [c for c in required_facility_cols if c not in facility_df.columns]

if missing_facility_cols:
    print("Missing required columns in facility_metadata.csv:", missing_facility_cols)
else:
    print("All required facility columns present.")

# Selecting a missing column would raise KeyError and abort the cell right
# after the report above, so restrict the null count to existing columns.
present_facility_cols = [c for c in required_facility_cols if c in facility_df.columns]
nulls_facility = facility_df[present_facility_cols].isnull().sum()
print("Null counts for key facility columns:")
print(nulls_facility)


In [ ]:
# Section 2 check (timeseries): report missing required columns, then count
# nulls in the required columns that are actually present.
required_ts_cols = [
    "facility_id",
    "timestamp",
    "pressure_mpa",
    "temperature_c",
]
missing_ts_cols = [c for c in required_ts_cols if c not in timeseries_df.columns]

if missing_ts_cols:
    print("Missing required columns in facility_timeseries.csv:", missing_ts_cols)
else:
    print("All required timeseries columns present.")

# Selecting a missing column would raise KeyError and abort the cell right
# after the report above, so restrict the null count to existing columns.
present_ts_cols = [c for c in required_ts_cols if c in timeseries_df.columns]
nulls_ts = timeseries_df[present_ts_cols].isnull().sum()
print("Null counts for key timeseries columns:")
print(nulls_ts)


In [ ]:
# Section 2 check (cycle summary): report missing required columns, then
# count nulls in the required columns that are actually present.
required_cycle_cols = [
    "facility_id",
    "cycle_index",
    "cycle_start",
    "cycle_end",
]
missing_cycle_cols = [c for c in required_cycle_cols if c not in cycle_summary_df.columns]

if missing_cycle_cols:
    print("Missing required columns in cycle_summary.csv:", missing_cycle_cols)
else:
    print("All required cycle summary columns present.")

# Selecting a missing column would raise KeyError and abort the cell right
# after the report above, so restrict the null count to existing columns.
present_cycle_cols = [c for c in required_cycle_cols if c in cycle_summary_df.columns]
nulls_cycle = cycle_summary_df[present_cycle_cols].isnull().sum()
print("Null counts for key cycle summary columns:")
print(nulls_cycle)


## 3. Simple physical range checks

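We perform basic physical sanity checks on depth, pressure and temperature ranges.
The exact thresholds are derived from the configuration in `config/uhs_config.yaml`;
the cell below only flags non-positive depths automatically and prints the observed
pressure and temperature ranges for manual comparison against those thresholds.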


In [ ]:
# Section 3: report the observed min/max of each physical quantity and flag
# facilities whose depth is zero or negative.
depth_values = facility_df["depth_m"]
print("Depth range (m):", depth_values.min(), "→", depth_values.max())

# `<= 0` with `.any()` is kept deliberately: NaN depths compare False and
# therefore do not trigger the warning, exactly as before.
has_non_positive_depth = (depth_values <= 0).any()
print(
    "WARNING: Some facilities have non-positive depth."
    if has_non_positive_depth
    else "OK: All facilities have positive depth."
)

# Pressure and temperature are only reported, not checked against limits.
pressure_values = timeseries_df["pressure_mpa"]
temperature_values = timeseries_df["temperature_c"]
print("Pressure range (MPa):", pressure_values.min(), "→", pressure_values.max())
print("Temperature range (°C):", temperature_values.min(), "→", temperature_values.max())


## 4. Mass-balance residual statistics

If the `mass_balance_residual` column is present in the timeseries, we compute
high-level statistics and count how many rows exceed a small threshold.


In [ ]:
# Section 4: summary statistics for the mass-balance residual, when available.
if "mass_balance_residual" not in timeseries_df.columns:
    print("Column 'mass_balance_residual' not present in timeseries data.")
else:
    # Null residuals are dropped before computing any statistic.
    residual = timeseries_df["mass_balance_residual"].dropna()
    residual_min = float(residual.min())
    residual_max = float(residual.max())
    print("Residual min / max:", residual_min, "/", residual_max)
    print("Residual mean      :", float(residual.mean()))

    # Count rows whose absolute residual exceeds the 1e-3 threshold.
    high = residual.abs().gt(1e-3).sum()
    print("Rows with |residual| > 1e-3:", int(high))


## 5. Facility-level aggregation of residual anomalies

We compute, for each facility, how many rows exceed the residual threshold.
This can highlight a small set of facilities with unusual behavior.


In [ ]:
# Section 5: per-facility count of rows whose |residual| exceeds the threshold,
# sorted so the facilities with the most flagged rows come first.
if "mass_balance_residual" in timeseries_df.columns:
    threshold = 1e-3
    # Work on a copy so the flag column is not added to `timeseries_df`.
    ts = timeseries_df.copy()
    # NaN residuals compare False and are therefore never flagged.
    ts["high_residual"] = ts["mass_balance_residual"].abs() > threshold

    facility_anomalies = ts.groupby("facility_id")["high_residual"].sum().reset_index()
    facility_anomalies = facility_anomalies.rename(
        columns={"high_residual": "high_residual_row_count"}
    ).sort_values("high_residual_row_count", ascending=False)

    # A bare expression inside an `if` body is not rendered by the notebook
    # (only a cell's final top-level expression is), so print explicitly.
    print(facility_anomalies.head())
else:
    print("Cannot compute facility-level anomalies: 'mass_balance_residual' not present.")


## 6. Write summary to `data/examples/example_validation_checks_output.txt`

This creates a simple human-readable summary file with the main validation outcomes.


In [ ]:
# Section 6: write a short, human-readable validation summary to the examples
# folder (created on demand).
EXAMPLES_DIR.mkdir(parents=True, exist_ok=True)
summary_path = EXAMPLES_DIR / "example_validation_checks_output.txt"

# Header, dataset sizes and the list of files whose columns were checked.
lines = [
    "SUHS-MRV UHS Dataset – Example Validation Summary\n",
    "\n",
    f"Number of facilities             : {facility_df['facility_id'].nunique()}\n",
    f"Timeseries rows                  : {len(timeseries_df)}\n",
    f"Cycle summary rows               : {len(cycle_summary_df)}\n",
    "\n",
    "Required column checks completed for:\n",
    "  - facility_metadata.csv\n",
    "  - facility_timeseries.csv\n",
    "  - cycle_summary.csv\n",
    "\n",
]

# Residual statistics are included only when the column exists.
if "mass_balance_residual" not in timeseries_df.columns:
    lines.append("Mass-balance residual column not present; residual checks skipped.\n")
else:
    residual = timeseries_df["mass_balance_residual"].dropna()
    high = (residual.abs() > 1e-3).sum()
    lines += [
        "Mass-balance residual statistics:\n",
        f"  min   : {float(residual.min()):.3e}\n",
        f"  max   : {float(residual.max()):.3e}\n",
        f"  mean  : {float(residual.mean()):.3e}\n",
        f"  rows with |residual| > 1e-3 : {int(high)}\n",
    ]

# Every entry already ends with a newline, so writelines emits them as-is.
with summary_path.open("w", encoding="utf-8") as f:
    f.writelines(lines)

# Last expression of the cell: shows where the summary was written.
summary_path
