# 04 – Harmonize Data for MVP Dashboard

**MVP-specific notebook**: This creates the curated/merged dataset for the Streamlit dashboard.

**What this step does:**
- **Aggregates emissions**: Pivots emissions by sector and calculates total emissions per region-year
- **Pivots health metrics**: Transforms respiratory health data from long to wide format (one column per metric)
- **Merges all datasets**: Combines emissions, health, and population on `nuts_id` + `year`
- **Calculates derived metrics**: Creates per-capita emissions and per-100k discharge rates

**Output**: `mvp/data/curated/eu_climate_health.parquet` - a single dataset with all metrics aligned by region and year

**Note**: This step is MVP-specific. Production uses the processed files directly in the star schema database.


In [None]:
from pathlib import Path
import sys

# Resolve the project root from the current working directory: step out of
# a "notebooks" directory first, then out of an "mvp" directory, if the
# working directory happens to be inside either of them.
PROJECT_ROOT = Path.cwd().resolve()
for _nested_dir in ("notebooks", "mvp"):
    if PROJECT_ROOT.name == _nested_dir:
        PROJECT_ROOT = PROJECT_ROOT.parent

# Put the MVP source directory at the front of the import path (only once,
# so re-running the cell does not add duplicates).
SRC_DIR = PROJECT_ROOT / "mvp" / "src"
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

PROJECT_ROOT


In [None]:
from pipeline.harmonize import build_curated_dataset

# Every input is read from the shared processed-data directory.
processed_dir = PROJECT_ROOT / "data" / "processed"
emissions_path = processed_dir / "emissions_nuts2.parquet"
cod_path = processed_dir / "health_causes_of_death.parquet"
discharges_path = processed_dir / "health_hospital_discharges.parquet"
population_path = processed_dir / "population_nuts2.parquet"

# The curated result is written under the MVP data tree.
curated_dir = PROJECT_ROOT / "mvp" / "data" / "curated"
output_path = curated_dir / "eu_climate_health.parquet"

# Run the harmonization step and keep whatever it returns as the dataset
# location for the print below.
curated_path = build_curated_dataset(
    output_path=output_path,
    emissions_path=emissions_path,
    cod_path=cod_path,
    discharges_path=discharges_path,
    population_path=population_path,
)

print(f"✓ Curated dataset created: {curated_path}")


In [None]:
import pandas as pd

# Load the curated dataset and report its dimensions and column names.
curated = pd.read_parquet(curated_path)
column_names = curated.columns.tolist()

print(f"Shape: {curated.shape}")
print(f"\nColumns ({len(column_names)}):")
print(column_names)
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders the first five rows.
curated.head()


In [None]:
# Metrics we would like to display; any that are absent from the curated
# dataset are skipped rather than raising.
metric_cols = [
    "total_emissions_kt",
    "emissions_per_capita_tonnes",
    "population",
    "cod_all_resp_rate",
    "discharge_all_resp",
    "discharge_all_resp_per_100k",
]
id_cols = ["nuts_id", "nuts_label", "year"]

existing_cols = set(curated.columns)
available_metrics = [name for name in metric_cols if name in existing_cols]

print(f"Sample metrics (showing {len(available_metrics)} of {len(metric_cols)} requested):")
# Identifier columns first, then the metrics that were found, in the
# order listed in metric_cols.
curated[[*id_cols, *available_metrics]].head(10)
