# 03 – Validate SUHS‑MRV UHS Dataset

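This notebook runs a small set of sanity checks against the generated dataset
(`facility_metadata.csv`, `facility_timeseries.csv`, and `cycle_summary.csv` under `data/generated/`).
The data directory is resolved relative to the parent of the current working directory, so run the notebook from its own folder.
The checks mirror the recommendations in `docs/validation_checks.md`.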

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
# Locate the generated data relative to the current working directory:
# its parent is taken as the repository root.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR.parent
DATA_DIR = REPO_ROOT.joinpath('data', 'generated')

# Load the three CSV files; timestamp-like columns are parsed as datetimes.
facility_df = pd.read_csv(DATA_DIR.joinpath('facility_metadata.csv'))
timeseries_df = pd.read_csv(
    DATA_DIR.joinpath('facility_timeseries.csv'),
    parse_dates=['timestamp'],
)
cycle_summary_df = pd.read_csv(
    DATA_DIR.joinpath('cycle_summary.csv'),
    parse_dates=['cycle_start', 'cycle_end'],
)

# Row counts of (metadata, timeseries, cycle summary), shown as the cell output.
tuple(map(len, (facility_df, timeseries_df, cycle_summary_df)))

## 1. Presence and basic row‑count sanity

In [None]:
# Collect the distinct facility IDs from each file. Null IDs are dropped,
# matching the default behaviour of `nunique()` which ignores missing values.
metadata_ids = set(facility_df['facility_id'].dropna())
timeseries_ids = set(timeseries_df['facility_id'].dropna())
cycle_ids = set(cycle_summary_df['facility_id'].dropna())

print('Unique facilities in metadata :', len(metadata_ids))
print('Unique facilities in timeseries:', len(timeseries_ids))
print('Unique facilities in cycles    :', len(cycle_ids))

# Compare the ID sets themselves rather than only their sizes: two files can
# contain the same number of distinct IDs while disagreeing on which IDs.
# The symmetric difference in the message lists the IDs found in only one file.
assert metadata_ids == timeseries_ids, (
    'facility_id mismatch between metadata and timeseries: '
    f'{sorted(metadata_ids ^ timeseries_ids, key=str)}'
)
assert metadata_ids == cycle_ids, (
    'facility_id mismatch between metadata and cycle summary: '
    f'{sorted(metadata_ids ^ cycle_ids, key=str)}'
)

print('OK: facility_id uniqueness is consistent across all three files.')

## 2. Required columns and null checks

In [None]:
# Columns that facility_metadata.csv is expected to provide.
required_facility_cols = ['facility_id', 'facility_type', 'depth_m']

# Required columns absent from the loaded metadata, in declaration order.
missing_cols = [
    name for name in required_facility_cols
    if name not in facility_df.columns
]

if not missing_cols:
    print('All required facility columns present.')
else:
    print('Missing required columns in facility_metadata.csv:', missing_cols)

In [None]:
# Count nulls only for the required columns that actually exist in the
# metadata. Selecting a missing column would raise KeyError and abort the
# notebook, even though missing columns are already reported separately.
checked_cols = [c for c in required_facility_cols if c in facility_df.columns]
nulls = facility_df[checked_cols].isnull().sum()
print('Null counts for key facility columns:')
print(nulls)

## 3. Simple mass‑balance view for one facility

In [None]:
# Use the first facility listed in the metadata and take an independent copy
# of its timeseries rows.
sample_facility = facility_df['facility_id'].iloc[0]
sample_ts = timeseries_df.loc[
    timeseries_df['facility_id'].eq(sample_facility)
].copy()

# Candidate columns for the view; only those present in the data are shown.
cols = [
    'h2_working_gas_kg',
    'h2_injected_kg',
    'h2_withdrawn_kg',
    'h2_losses_total_kg',
]
present = [name for name in cols if name in sample_ts.columns]
print('Columns available for mass‑balance view:', present)

# First rows of the available columns, shown as the cell output.
sample_ts.loc[:, present].head()