# 03 – Validate SUHS‑MRV UHS Dataset

This notebook runs a focused set of **sanity and MRV checks** on the generated SUHS‑MRV
dataset. The checks mirror the guidance in `docs/validation_checks.md` and are intended
to be simple, transparent, and easy to adapt.

It assumes that the following files already exist under `data/generated/`:

- `facility_metadata.csv`
- `facility_timeseries.csv`
- `cycle_summary.csv`


In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Paths
# The repository root is taken to be the parent of the current working
# directory. This presumably expects the kernel to be started from a direct
# subdirectory of the repo (e.g. a notebooks folder) — TODO confirm.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR.parent
DATA_DIR = REPO_ROOT / 'data' / 'generated'

# Put the repo root on the import path. The membership guard keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

print('Repo root:', REPO_ROOT)
print('Data dir :', DATA_DIR)

In [None]:
# Read the three generated tables from DATA_DIR.
# The timeseries and cycle-summary files have their date columns parsed on load.
facility_df = pd.read_csv(DATA_DIR.joinpath('facility_metadata.csv'))

timeseries_path = DATA_DIR.joinpath('facility_timeseries.csv')
timeseries_df = pd.read_csv(timeseries_path, parse_dates=['timestamp'])

cycle_summary_path = DATA_DIR.joinpath('cycle_summary.csv')
cycle_summary_df = pd.read_csv(
    cycle_summary_path,
    parse_dates=['cycle_start', 'cycle_end'],
)

# Cell output: row counts in the order (metadata, timeseries, cycle summary)
tuple(len(table) for table in (facility_df, timeseries_df, cycle_summary_df))

## 1. Presence and basic row‑count sanity

In [None]:
# Cross-file consistency of facility IDs.
#
# Comparing only the *number* of distinct IDs is not enough: two files can hold
# the same count of distinct IDs while referring to different facilities.
# The actual ID sets are therefore compared. Missing (NaN) IDs are dropped
# first, matching what `nunique()` counts by default.
meta_ids = set(facility_df['facility_id'].dropna().unique())
ts_ids = set(timeseries_df['facility_id'].dropna().unique())
cycle_ids = set(cycle_summary_df['facility_id'].dropna().unique())

print('Unique facilities in metadata :', facility_df['facility_id'].nunique())
print('Unique facilities in timeseries:', timeseries_df['facility_id'].nunique())
print('Unique facilities in cycles    :', cycle_summary_df['facility_id'].nunique())

# `key=str` keeps the diagnostic sortable even if IDs are of mixed types.
assert meta_ids == ts_ids, (
    'facility_id mismatch between metadata and timeseries: '
    f'only in metadata={sorted(meta_ids - ts_ids, key=str)}, '
    f'only in timeseries={sorted(ts_ids - meta_ids, key=str)}'
)
assert meta_ids == cycle_ids, (
    'facility_id mismatch between metadata and cycle summary: '
    f'only in metadata={sorted(meta_ids - cycle_ids, key=str)}, '
    f'only in cycles={sorted(cycle_ids - meta_ids, key=str)}'
)

print('OK: facility_id uniqueness is consistent across all three files.')

## 2. Required columns and null checks

In [None]:
# Columns that must be present in the facility metadata table
required_facility_cols = ['facility_id', 'facility_type', 'depth_m']

# Gather, in declaration order, the required columns the table lacks
missing_cols = []
for column_name in required_facility_cols:
    if column_name not in facility_df.columns:
        missing_cols.append(column_name)

print('Required facility columns:', required_facility_cols)
print('Missing columns           :', missing_cols)

# Stop here if the schema is incomplete; the null count below needs every column
if len(missing_cols) > 0:
    raise ValueError(f'Missing required columns in facility_metadata.csv: {missing_cols}')

# Per-column count of missing values across the required columns (reported only)
nulls = facility_df[required_facility_cols].isna().sum()
print('\nNull counts for key facility columns:')
print(nulls)

## 3. Simple mass‑balance view for one facility

In [None]:
# Use the first facility listed in the metadata as the sample and pull its
# timeseries rows in chronological order.
sample_facility = facility_df['facility_id'].iloc[0]
is_sample_row = timeseries_df['facility_id'] == sample_facility
sample_ts = timeseries_df[is_sample_row].copy().sort_values('timestamp')

# Candidate mass-balance columns; only those actually present are shown
cols = [
    'h2_working_gas_kg',
    'h2_injected_kg',
    'h2_withdrawn_kg',
    'h2_losses_total_kg',
]
present = []
for candidate in cols:
    if candidate in sample_ts.columns:
        present.append(candidate)

print('Facility:', sample_facility)
print('Columns available for mass‑balance view:', present)

# Cell output: first rows of the available mass-balance columns
sample_ts.loc[:, present].head()

## 4. MRV residual distribution

In [None]:
# Summarise and plot the MRV residual column, if the timeseries has one.
if 'mrv_residual' in timeseries_df.columns:
    resid = timeseries_df['mrv_residual'].astype(float)
    print('Residual min / max / mean:', resid.min(), resid.max(), resid.mean())

    # pandas min/max/mean skip NaN, but histogram binning needs a finite
    # data range: NaN or +/-inf values make the plotting call fail. Plot only
    # the finite residuals and report how many values were left out.
    resid_finite = resid[np.isfinite(resid)]
    n_excluded = len(resid) - len(resid_finite)
    if n_excluded:
        print('Non-finite residuals excluded from histogram:', n_excluded)

    if resid_finite.empty:
        print('No finite MRV residuals available to plot.')
    else:
        plt.figure(figsize=(8, 4))
        plt.hist(resid_finite, bins=50)
        plt.xlabel('MRV residual')
        plt.ylabel('Count')
        plt.title('Distribution of MRV residuals')
        plt.tight_layout()
        plt.show()
else:
    print("Column 'mrv_residual' not found in timeseries.")

## 5. Pressure and temperature envelope sanity

In [None]:
# Overall envelope (smallest and largest value) of the pressure and
# temperature columns across the whole timeseries table.
pressure_series = timeseries_df['pressure_mpa']
temperature_series = timeseries_df['temperature_c']

p_min, p_max = pressure_series.min(), pressure_series.max()
t_min, t_max = temperature_series.min(), temperature_series.max()

print('Pressure MPa range   :', p_min, '→', p_max)
print('Temperature °C range :', t_min, '→', t_max)

## 6. Per‑facility basic health summary

In [None]:
# Named aggregations: output column -> (source column, reducer).
# Gives one row per facility_id with its row count and pressure/temperature extremes.
health_spec = {
    'n_rows': ('timestamp', 'size'),
    'min_pressure': ('pressure_mpa', 'min'),
    'max_pressure': ('pressure_mpa', 'max'),
    'min_temp': ('temperature_c', 'min'),
    'max_temp': ('temperature_c', 'max'),
}
grouped_by_facility = timeseries_df.groupby('facility_id')
per_facility = grouped_by_facility.agg(**health_spec)

# Cell output: first few facilities of the summary
per_facility.head()