# 02 – Explore and Plot SUHS‑MRV UHS Dataset

This notebook provides basic exploratory analysis and plots for the SUHS‑MRV dataset.

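It assumes that the generated CSV files (`facility_metadata.csv`, `facility_timeseries.csv`, and `cycle_summary.csv`) already exist under `data/generated/` at the repository root.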


In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Directory the kernel was started in (normally the folder holding this notebook).
NOTEBOOK_DIR = Path.cwd()

# Locate the repository root by walking upwards from the working directory and
# taking the first directory that contains data/generated. This keeps the
# notebook working whether the kernel is launched from the notebook folder, from
# the repository root, or from a nested sub-folder. If no such directory is
# found, fall back to the parent of the working directory (the original
# assumption), so the printed paths still show where the data was expected.
REPO_ROOT = next(
    (
        candidate
        for candidate in (NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (candidate / "data" / "generated").is_dir()
    ),
    NOTEBOOK_DIR.parent,
)
DATA_DIR = REPO_ROOT / "data" / "generated"

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print("Repo root :", REPO_ROOT)
print("Data dir  :", DATA_DIR)


In [None]:
# Read the three generated CSV tables from DATA_DIR into DataFrames.

# Facility metadata: loaded as-is, no date parsing requested.
facility_metadata_path = DATA_DIR / "facility_metadata.csv"
facility_df = pd.read_csv(facility_metadata_path)

# Facility time series: the "timestamp" column is parsed as dates on load.
facility_timeseries_path = DATA_DIR / "facility_timeseries.csv"
timeseries_df = pd.read_csv(facility_timeseries_path, parse_dates=["timestamp"])

# Cycle summary: both cycle boundary columns are parsed as dates on load.
cycle_summary_path = DATA_DIR / "cycle_summary.csv"
cycle_date_columns = ["cycle_start", "cycle_end"]
cycle_summary_df = pd.read_csv(cycle_summary_path, parse_dates=cycle_date_columns)

# Show the first rows of the metadata table as the cell output.
facility_df.head()


## Basic summary statistics


In [None]:
# Headline counts: distinct facility ids and the row count of each long table.
n_facilities = facility_df["facility_id"].nunique()
n_timeseries_rows = timeseries_df.shape[0]
n_cycle_rows = cycle_summary_df.shape[0]

print("Number of facilities :", n_facilities)
print("Timeseries rows      :", n_timeseries_rows)
print("Cycle summary rows   :", n_cycle_rows)

# Describe every metadata column (include="all"), transposed so each original
# column becomes a row of the displayed summary.
facility_df.describe(include="all").transpose()


## Example: pressure history for one facility


In [None]:
# Use the facility id in the first row of the metadata table as the example.
sample_facility = facility_df["facility_id"].iat[0]

# Keep only that facility's records and order them by timestamp; sort_values
# returns a new frame, so timeseries_df itself is left untouched.
ts = (
    timeseries_df
    .loc[lambda frame: frame["facility_id"] == sample_facility]
    .sort_values("timestamp")
)

print("Facility:", sample_facility)
# Preview the two columns used by the pressure plot.
ts.loc[:, ["timestamp", "pressure_mpa"]].head()


In [None]:
# Draw the sample facility's pressure against time on a single 10x4 inch axes,
# using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(ts["timestamp"], ts["pressure_mpa"])
ax.set_xlabel("Time")
ax.set_ylabel("Pressure (MPa)")
ax.set_title(f"Pressure history – {sample_facility}")
fig.tight_layout()
plt.show()
