# 02 – Exploratory Data Analysis

Quick exploratory views of the curated EU climate-health dataset to guide the dashboard design. Run the setup and load cells first, then use the Plotly figures to explore patterns (per-capita emissions vs respiratory health, sector profiles, and temporal trends).


In [None]:
from pathlib import Path
import sys

# Locate the project root: when the kernel is started from inside the
# "notebooks" folder, the root is one level up; otherwise it is the
# current working directory itself.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

# Put <root>/src at the front of the import path (once) so modules that
# live there can be imported from this notebook.
SRC_DIR = PROJECT_ROOT.joinpath("src")
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

# Location of the curated parquet file read by the next cell; the bare
# expression on the last line displays the path as the cell output.
DATA_PATH = PROJECT_ROOT.joinpath("data", "curated", "eu_climate_health.parquet")
DATA_PATH


In [None]:
import pandas as pd

# Read the curated dataset from the parquet file located by the setup cell,
# then preview the first five rows as the cell output.
curated = pd.read_parquet(path=DATA_PATH)
curated.head(n=5)


In [None]:
# Most recent year present in the data, and an independent copy of the rows
# belonging to that year (copied so later column edits do not touch `curated`).
latest_year = curated['year'].max()
is_latest = curated['year'].eq(latest_year)
latest_snapshot = curated.loc[is_latest].copy()

# Sector columns are recognised purely by name: "emissions_<something>_kt".
sector_cols = []
for column_name in curated.columns:
    if column_name.startswith('emissions_') and column_name.endswith('_kt'):
        sector_cols.append(column_name)

# Display the year and a sample of the detected sector columns.
latest_year, sector_cols[:5]


In [None]:
import plotly.express as px

# Scatter of per-capita emissions against the COPD rate for the latest year,
# one marker per region, coloured by country and sized by population.

# Columns the figure actually uses; `nuts_id` is carried along only as the
# fallback hover label for regions without a `nuts_label`.
scatter_cols = ['nuts_label', 'country_iso', 'emissions_per_capita_tonnes', 'cod_copd_rate', 'population']
# Both axis values must be present to place a marker, and Plotly rejects NaN
# marker sizes, so rows missing `population` are dropped as well.
scatter_required = ['emissions_per_capita_tonnes', 'cod_copd_rate', 'population']
scatter_df = (
    latest_snapshot[['nuts_id'] + scatter_cols]
    .dropna(subset=scatter_required)
    .copy()
)
scatter_df['nuts_label'] = scatter_df['nuts_label'].fillna(scatter_df['nuts_id'])
fig_emissions_vs_copd = px.scatter(
    scatter_df,
    x='emissions_per_capita_tonnes',
    y='cod_copd_rate',
    color='country_iso',
    size='population',
    hover_name='nuts_label',
    # NOTE(review): with `color` set, Plotly Express fits one OLS line per
    # colour group by default (and needs statsmodels installed) -- confirm
    # that per-country trendlines, not a single overall fit, are intended.
    trendline='ols',
    labels={
        'emissions_per_capita_tonnes': 'Emissions per capita (tonnes CO₂-eq)',
        'cod_copd_rate': 'COPD age-std rate per 100k'
    },
    title=f'Per-capita emissions vs COPD mortality (NUTS2, {latest_year})'
)
fig_emissions_vs_copd


In [None]:
# Stacked bar of sector emissions for the ten regions with the largest total
# emissions in the latest year.

# Copy so the label back-fill below does not write into `latest_snapshot`.
top_regions = latest_snapshot.nlargest(10, 'total_emissions_kt').copy()
# Same fallback as the scatter figure: a region without a label is shown by
# its id instead of losing its x-axis category.
top_regions['nuts_label'] = top_regions['nuts_label'].fillna(top_regions['nuts_id'])

# Reshape from one column per sector to one row per (region, sector).
sector_id_cols = ['nuts_id', 'nuts_label', 'total_emissions_kt']
sector_melt = top_regions[sector_id_cols + sector_cols].melt(
    id_vars=sector_id_cols,
    value_vars=sector_cols,
    var_name='sector_group',
    value_name='emissions_kt'
)
# Turn column names such as "emissions_<sector>_kt" into readable legend
# entries. Every replacement is a literal (non-regex) substitution.
sector_melt['sector_group'] = (
    sector_melt['sector_group']
    .str.replace('emissions_', '', regex=False)
    .str.replace('_kt', '', regex=False)
    .str.replace('_', ' ', regex=False)
    .str.title()
)
fig_sector_stack = px.bar(
    sector_melt,
    x='nuts_label',
    y='emissions_kt',
    color='sector_group',
    title=f'Sector contribution for top emitters ({latest_year})',
    labels={'emissions_kt': 'Emissions (kt CO₂-eq)', 'nuts_label': 'NUTS2 region'}
)
fig_sector_stack


In [None]:
# Yearly trend of per-capita emissions and respiratory mortality for the five
# countries with the most rows in the curated table, one facet per metric.

focus_countries = curated['country_iso'].value_counts().head(5).index.tolist()
trend_df = curated[curated['country_iso'].isin(focus_countries)]

# The table holds several rows per country and year, so average them first:
# the figure is titled "country averages" and a line trace needs exactly one
# point per (country, year) to avoid zigzagging between individual rows.
# NOTE(review): this is an unweighted mean over rows (NaNs skipped); switch to
# a population-weighted mean if that is what the dashboard should show.
trend_metrics = ['emissions_per_capita_tonnes', 'cod_all_resp_rate']
country_year_means = (
    trend_df.groupby(['country_iso', 'year'], as_index=False)[trend_metrics]
    .mean()
    .sort_values(['country_iso', 'year'])
)
trend_long = country_year_means.melt(
    id_vars=['country_iso', 'year'],
    value_vars=trend_metrics,
    var_name='metric',
    value_name='value'
)
metric_labels = {
    'emissions_per_capita_tonnes': 'Per-capita emissions (tonnes)',
    'cod_all_resp_rate': 'Respiratory mortality rate (per 100k)'
}
trend_long['metric_label'] = trend_long['metric'].map(metric_labels)
fig_trends = px.line(
    trend_long,
    x='year',
    y='value',
    color='country_iso',
    facet_row='metric_label',
    markers=True,
    title='Per-capita emissions vs respiratory mortality (country averages)'
)
# The two metrics use different units, so give each facet its own y-axis
# range instead of the matched axes Plotly Express applies to facets.
fig_trends.update_yaxes(matches=None)
# Show only the metric name in the facet captions ("metric_label=..." otherwise).
fig_trends.for_each_annotation(lambda ann: ann.update(text=ann.text.split('=')[-1]))
fig_trends.update_layout(height=700)
fig_trends


In [None]:
# Pairwise Pearson correlations between the emission and mortality metrics,
# computed only on rows where every listed metric is present.
metrics_for_corr = [
    'total_emissions_kt',
    'emissions_per_capita_tonnes',
    'cod_all_resp_rate',
    'cod_copd_rate',
    'cod_asthma_rate',
]
complete_rows = curated.loc[:, metrics_for_corr].dropna(how='any')
corr_df = complete_rows.corr(method='pearson')
corr_df
