# 02 – Explore and Plot SUHS‑MRV UHS Dataset

This notebook provides basic exploratory analysis and plots for the SUHS‑MRV dataset.

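It assumes that the generated CSV files (`facility_metadata.csv`, `facility_timeseries.csv` and `cycle_summary.csv`) already exist under `data/generated/` at the repository root.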

In [None]:
import sys
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Resolve the repository root from the kernel's working directory.
#
# The original layout assumption is that the working directory sits exactly one
# level below the repo root. Some front-ends start the kernel at the repo root
# (or deeper in the tree), which would make DATA_DIR point at a non-existent
# folder and the CSV loading fail with an unhelpful FileNotFoundError.
# Probe the parent first (original behaviour), then the working directory
# itself, then every remaining ancestor, and keep the first location that
# actually contains data/generated/.
NOTEBOOK_DIR = Path.cwd()
DATA_SUBDIR = Path('data') / 'generated'
REPO_ROOT = next(
    (
        candidate
        for candidate in (NOTEBOOK_DIR.parent, NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (candidate / DATA_SUBDIR).is_dir()
    ),
    NOTEBOOK_DIR.parent,  # nothing found: keep the original assumption
)
DATA_DIR = REPO_ROOT / DATA_SUBDIR

print('Repo root   :', REPO_ROOT)
print('Data dir    :', DATA_DIR)
if not DATA_DIR.is_dir():
    # Surface the problem here rather than as a read_csv traceback further down.
    print('WARNING: data directory not found - generate the CSV files before running the cells below.')

In [None]:
# Read the three generated tables. Timestamp-like columns are converted to
# datetimes while reading so they can be sorted and plotted directly.
metadata_csv = DATA_DIR / 'facility_metadata.csv'
timeseries_csv = DATA_DIR / 'facility_timeseries.csv'
cycle_summary_csv = DATA_DIR / 'cycle_summary.csv'

facility_df = pd.read_csv(metadata_csv)
timeseries_df = pd.read_csv(timeseries_csv, parse_dates=['timestamp'])
cycle_summary_df = pd.read_csv(cycle_summary_csv, parse_dates=['cycle_start', 'cycle_end'])

# Preview the first rows of the facility metadata as the cell output.
facility_df.head()

## Basic summary statistics

In [None]:
# Headline sizes of the three tables.
n_facilities = facility_df['facility_id'].nunique()
n_timeseries_rows = timeseries_df.shape[0]
n_cycle_rows = cycle_summary_df.shape[0]

print('Number of facilities :', n_facilities)
print('Timeseries rows      :', n_timeseries_rows)
print('Cycle summary rows   :', n_cycle_rows)

# Per-column summary of the facility metadata, one row per column.
facility_df.describe(include='all').transpose()

## Example: pressure and temperature for a single facility

In [None]:
# Use the first facility listed in the metadata as the worked example.
sample_facility = facility_df['facility_id'].iloc[0]

# Keep only that facility's rows (as an independent copy), ordered by time.
is_sample_row = timeseries_df['facility_id'].eq(sample_facility)
sample_ts = timeseries_df.loc[is_sample_row].copy().sort_values('timestamp')

print('Facility:', sample_facility)
preview_columns = ['timestamp', 'pressure_mpa', 'temperature_c']
sample_ts[preview_columns].head()

In [None]:
# Pressure trace of the sample facility, drawn through the explicit Figure/Axes API.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(sample_ts['timestamp'], sample_ts['pressure_mpa'])
ax.set(
    xlabel='Time',
    ylabel='Pressure (MPa)',
    title=f'Pressure history – facility {sample_facility}',
)
fig.tight_layout()
plt.show()

In [None]:
# Temperature trace of the sample facility, drawn through the explicit Figure/Axes API.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(sample_ts['timestamp'], sample_ts['temperature_c'])
ax.set(
    xlabel='Time',
    ylabel='Temperature (°C)',
    title=f'Temperature history – facility {sample_facility}',
)
fig.tight_layout()
plt.show()

## Working‑gas capacity distribution

In [None]:
# Histogram of working-gas capacity; skipped with a message when the metadata
# file does not carry that column.
if 'working_gas_capacity_kg' not in facility_df.columns:
    print("Column 'working_gas_capacity_kg' not found in facility_metadata.csv")
else:
    # Series.hist returns the Axes it drew on, so label that object directly.
    ax = facility_df['working_gas_capacity_kg'].hist(bins=20)
    ax.set_xlabel('Working gas capacity (kg)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of working gas capacity across facilities')
    ax.figure.tight_layout()
    plt.show()