# 02 – Explore and Plot SUHS-MRV UHS Dataset

This notebook provides basic exploratory analysis and plots for the SUHS-MRV dataset.

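It assumes that the generated CSV files (`facility_metadata.csv`, `facility_timeseries.csv`, and `cycle_summary.csv`) already exist under `data/generated/` at the repository root.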


In [ ]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt


In [ ]:
def find_repo_root(start: Path) -> Path:
    """Locate the directory that contains ``data/generated``.

    The parent of ``start`` is tried first (the layout in which the notebook
    lives one level below the repository root), then ``start`` itself, then
    the remaining ancestors of ``start``.

    Parameters
    ----------
    start : Path
        Directory to search from, normally the kernel's working directory.

    Returns
    -------
    Path
        The first candidate that has a ``data/generated`` sub-directory, or
        ``start.parent`` when no candidate qualifies.
    """
    default_root = start.parent
    # list(...) before slicing keeps this working on Python versions where
    # Path.parents does not support slices.
    candidates = [default_root, start, *list(start.parents)[1:]]
    for candidate in candidates:
        if (candidate / "data" / "generated").is_dir():
            return candidate
    return default_root


NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = find_repo_root(NOTEBOOK_DIR)
DATA_DIR = REPO_ROOT / "data" / "generated"

print("Repo root   :", REPO_ROOT)
print("Data dir    :", DATA_DIR)

# Surface a misconfigured working directory here rather than as a
# FileNotFoundError in a later cell.
if not DATA_DIR.is_dir():
    print("WARNING: data directory does not exist:", DATA_DIR)


In [ ]:
# Read the three generated tables. Date-like columns are parsed at load time
# so they can be sorted and used directly as plot axes.
metadata_path = DATA_DIR / "facility_metadata.csv"
timeseries_path = DATA_DIR / "facility_timeseries.csv"
cycle_summary_path = DATA_DIR / "cycle_summary.csv"

facility_df = pd.read_csv(metadata_path)
timeseries_df = pd.read_csv(timeseries_path, parse_dates=["timestamp"])
cycle_summary_df = pd.read_csv(
    cycle_summary_path, parse_dates=["cycle_start", "cycle_end"]
)

# Preview of the facility metadata table.
facility_df.head()


## Basic summary statistics


In [ ]:
# Row counts for each loaded table, followed by a per-column summary of the
# facility metadata (one row per column after transposing).
n_facilities = facility_df["facility_id"].nunique()
n_timeseries_rows = len(timeseries_df)
n_cycle_rows = len(cycle_summary_df)

print("Number of facilities :", n_facilities)
print("Timeseries rows      :", n_timeseries_rows)
print("Cycle summary rows   :", n_cycle_rows)

facility_df.describe(include="all").transpose()


## Example: pressure and temperature for a single facility


In [ ]:
# Use the first facility listed in the metadata as the worked example and
# pull its time series in chronological order.
sample_facility = facility_df["facility_id"].iloc[0]
is_sample_row = timeseries_df["facility_id"] == sample_facility
sample_ts = timeseries_df.loc[is_sample_row].copy().sort_values("timestamp")

print("Facility:", sample_facility)
preview_columns = ["timestamp", "pressure_mpa", "temperature_c"]
sample_ts[preview_columns].head()


In [ ]:
# Pressure trace for the sample facility, drawn through the explicit
# Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(sample_ts["timestamp"], sample_ts["pressure_mpa"])
ax.set_xlabel("Time")
ax.set_ylabel("Pressure (MPa)")
ax.set_title(f"Pressure history – facility {sample_facility}")
fig.tight_layout()
plt.show()


In [ ]:
# Temperature trace for the sample facility, drawn through the explicit
# Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(sample_ts["timestamp"], sample_ts["temperature_c"])
ax.set_xlabel("Time")
ax.set_ylabel("Temperature (°C)")
ax.set_title(f"Temperature history – facility {sample_facility}")
fig.tight_layout()
plt.show()


## Working-gas capacity distribution


In [ ]:
# Histogram of working-gas capacity across facilities; the column is optional,
# so report its absence instead of raising a KeyError.
if "working_gas_capacity_kg" not in facility_df.columns:
    print("Column 'working_gas_capacity_kg' not found in facility_metadata.csv")
else:
    capacity_kg = facility_df["working_gas_capacity_kg"]
    capacity_kg.hist(bins=20)
    ax = plt.gca()
    ax.set_xlabel("Working gas capacity (kg)")
    ax.set_ylabel("Count")
    ax.set_title("Distribution of working gas capacity across facilities")
    plt.tight_layout()
    plt.show()


## Facility-level filtering and operational plots

The following cells show how to filter the time series for a single facility and create
basic operational plots (pressure, injection/withdrawal, residuals, purity).


In [ ]:
# Facility to inspect in the cells below.
# Defaults to the first ID in the metadata; or set to a specific ID, e.g. "UHS_001".
facility_id = facility_df["facility_id"].iloc[0]

# Rows for that facility only, as an independent copy in chronological order.
belongs_to_facility = timeseries_df["facility_id"] == facility_id
ts_f = timeseries_df.loc[belongs_to_facility].copy().sort_values("timestamp")

print("Selected facility:", facility_id)
ts_f.head()


In [ ]:
# Pressure history for the selected facility; tick labels are rotated so the
# dates do not overlap.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts_f["timestamp"], ts_f["pressure_mpa"])
ax.set_title(f"Pressure (MPa) vs Time – facility {facility_id}")
ax.set_xlabel("Date")
ax.set_ylabel("Pressure (MPa)")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


In [ ]:
# Injected and withdrawn mass on a shared axis for the selected facility.
fig, ax = plt.subplots(figsize=(12, 5))
flow_series = (("h2_injected_kg", "Injected"), ("h2_withdrawn_kg", "Withdrawn"))
for column, legend_label in flow_series:
    ax.plot(ts_f["timestamp"], ts_f[column], label=legend_label)
ax.legend()
ax.set_title(f"Injection / Withdrawal Profile – facility {facility_id}")
ax.set_xlabel("Date")
ax.set_ylabel("Mass (kg)")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


## Mass-balance residual and purity checks


In [ ]:
# Flag rows whose mass-balance residual is large in magnitude.
# The comparison uses the absolute value so that large negative residuals are
# flagged as well as large positive ones.
# NOTE(review): assumes the residual can be signed — confirm against the
# dataset generator; if it is always non-negative this is a no-op.
RESIDUAL_THRESHOLD = 1e-3

if "mass_balance_residual" in ts_f.columns:
    residual_magnitude = ts_f["mass_balance_residual"].abs()
    high_residuals = ts_f[residual_magnitude > RESIDUAL_THRESHOLD]
    print("Number of high-residual rows:", len(high_residuals))
    display(high_residuals[["timestamp", "mass_balance_residual"]].head())
else:
    print("Column 'mass_balance_residual' not found in timeseries data.")


In [ ]:
# Distribution of the mass-balance residual for the selected facility; the
# column is optional, so report its absence instead of raising a KeyError.
if "mass_balance_residual" not in ts_f.columns:
    print("Cannot plot residuals: column 'mass_balance_residual' not present.")
else:
    fig, ax = plt.subplots(figsize=(8, 5))
    ts_f["mass_balance_residual"].hist(bins=50, ax=ax)
    ax.set_title(f"Mass-balance residual distribution – facility {facility_id}")
    ax.set_xlabel("Residual (fraction of capacity)")
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()


In [ ]:
# Working-gas purity over time for the selected facility; the column is
# optional, so report its absence instead of raising a KeyError.
if "h2_working_purity_pct" not in ts_f.columns:
    print("Column 'h2_working_purity_pct' not found in timeseries data.")
else:
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(ts_f["timestamp"], ts_f["h2_working_purity_pct"])
    ax.set_title(f"Working Gas Purity (%) Over Time – facility {facility_id}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Purity (%)")
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()


In [ ]:
# List the first rows where working-gas purity falls below 98 %; the column is
# optional, so report its absence instead of raising a KeyError.
if "h2_working_purity_pct" not in ts_f.columns:
    print("Cannot compute impurity_df: column 'h2_working_purity_pct' not present.")
else:
    below_purity_limit = ts_f["h2_working_purity_pct"] < 98.0
    impurity_df = ts_f[below_purity_limit]
    print("Rows with working-gas purity < 98%:")
    display(impurity_df[["timestamp", "h2_working_purity_pct"]].head())
