# 01b – Ingest Eurostat Respiratory Health Data

Use this notebook to rerun the Eurostat ingestion helpers that convert the bulky TSV exports (causes of death + hospital discharges) into tidy parquet files stored under `data/processed/`.

**Key features:**
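- **Preserves all geographic levels**: Keeps country (2 chars), NUTS1 (3 chars), NUTS2 (4 chars), and NUTS3 (5 chars) codes, plus any longer geo codes present in the export (the causes-of-death data contains some 9-character codes, which are not NUTS regions)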
- **Parses dimension columns**: Extracts geo codes from comma-separated dimension columns
- **Standardizes geo codes**: `.str.strip().str.upper()` → `nuts_id` (for consistency with EDGAR data)
- **Handles missing data**: Converts `:` indicator to `pd.NA` before numeric conversion
- **Melts wide format**: Converts year columns to long format

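**Note**: All geographic levels are preserved by default for maximum flexibility in the star schema. Filtering to specific NUTS levels can be done during ETL or in queries; alternatively, pass `filter_nuts2=True` to the ingest functions to keep only 4-character (NUTS2) codes at ingestion time.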


In [1]:
from pathlib import Path
from typing import Optional
import pandas as pd

# Setup paths
# If the kernel was started inside a "notebooks" directory, step up one level;
# every data/ path used by this notebook is built from PROJECT_ROOT.
PROJECT_ROOT = Path.cwd().resolve()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

PROJECT_ROOT


WindowsPath('C:/Users/narek.pirumyan/Desktop/IAE/2025/Big Data/Capstone Project/air-health-eu')

In [2]:
def _tidy_eurostat_tsv(path: Path, value_name: str, filter_nuts2: bool = False) -> pd.DataFrame:
    """Convert Eurostat multi-dimension TSV into a tidy dataframe."""
    df = pd.read_csv(path, sep="\t")
    dimension_col = df.columns[0]
    dimension_names = dimension_col.split(",")
    dims = df[dimension_col].str.split(",", expand=True)
    dims.columns = dimension_names

    wide_values = df.drop(columns=[dimension_col])
    tidy = pd.concat([dims, wide_values], axis=1)
    year_columns = [c for c in wide_values.columns if c.strip().isdigit()]

    tidy = tidy.melt(
        id_vars=dimension_names,
        value_vars=year_columns,
        var_name="year",
        value_name=value_name,
    )

    tidy["year"] = tidy["year"].astype(str).str.strip().astype(int)
    tidy[value_name] = (
        tidy[value_name]
        .astype(str)
        .str.strip()
        .replace(":", pd.NA)
    )
    tidy[value_name] = pd.to_numeric(tidy[value_name], errors="coerce")

    # Handle geo column (may be named 'geo' or 'geo\TIME_PERIOD')
    geo_col = None
    for col_name in tidy.columns:
        if 'geo' in col_name.lower() and 'time_period' in col_name.lower():
            geo_col = col_name
            break
    if not geo_col and 'geo' in tidy.columns:
        geo_col = 'geo'

    if geo_col:
        tidy["geo"] = tidy[geo_col].str.strip().str.upper()
        # Optional: Filter to NUTS2 level only (4 characters) if requested
        # By default, we preserve all geographic levels for maximum flexibility
        if filter_nuts2:
            def _is_valid_nuts2(code: str) -> bool:
                if not isinstance(code, str):
                    code = str(code)
                return len(code.strip()) == 4
            tidy = tidy[tidy["geo"].apply(_is_valid_nuts2)].copy()
        tidy = tidy.rename(columns={"geo": "nuts_id"})
        if geo_col != "nuts_id" and geo_col in tidy.columns:
            tidy = tidy.drop(columns=[geo_col])
    return tidy

def ingest_causes_of_death(
    tsv_path: Path,
    output_path: Optional[Path] = None,
    filter_nuts2: bool = False,
) -> Path:
    """Tidy the age-standardised causes-of-death TSV and store it as parquet.

    Parameters
    ----------
    tsv_path:
        Eurostat TSV export to read.
    output_path:
        Destination parquet file; when omitted the file is written to
        ``data/processed/health_causes_of_death.parquet`` under PROJECT_ROOT.
    filter_nuts2:
        Forwarded to ``_tidy_eurostat_tsv``.

    Returns
    -------
    Path
        The parquet file that was written.
    """
    value_column = "age_standardised_rate_per_100k"
    column_aliases = {
        "freq": "frequency",
        "unit": "unit_code",
        "sex": "sex",
        "age": "age_group",
        "icd10": "icd10_group",
    }
    output_columns = [
        "nuts_id", "year", "frequency", "unit_code",
        "sex", "age_group", "icd10_group", value_column,
    ]

    destination = Path(
        PROJECT_ROOT / "data" / "processed" / "health_causes_of_death.parquet"
        if output_path is None
        else output_path
    )
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Rename -> select the output schema -> drop rows without a value -> write.
    (
        _tidy_eurostat_tsv(tsv_path, value_column, filter_nuts2=filter_nuts2)
        .rename(columns=column_aliases)
        .loc[:, output_columns]
        .dropna(subset=[value_column])
        .to_parquet(destination, index=False)
    )
    return destination

def ingest_hospital_discharges(
    tsv_path: Path,
    output_path: Optional[Path] = None,
    filter_nuts2: bool = False,
) -> Path:
    """Tidy the hospital-discharge TSV and store it as parquet.

    Parameters
    ----------
    tsv_path:
        Eurostat TSV export to read.
    output_path:
        Destination parquet file; when omitted the file is written to
        ``data/processed/health_hospital_discharges.parquet`` under PROJECT_ROOT.
    filter_nuts2:
        Forwarded to ``_tidy_eurostat_tsv``.

    Returns
    -------
    Path
        The parquet file that was written.
    """
    value_column = "discharges"
    column_aliases = {
        "freq": "frequency",
        "age": "age_group",
        "indic_he": "indicator",
        "unit": "unit_code",
        "sex": "sex",
        "icd10": "icd10_group",
    }
    output_columns = [
        "nuts_id", "year", "frequency", "indicator", "unit_code",
        "sex", "age_group", "icd10_group", value_column,
    ]

    destination = Path(
        PROJECT_ROOT / "data" / "processed" / "health_hospital_discharges.parquet"
        if output_path is None
        else output_path
    )
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Rename -> select the output schema -> drop rows without a value -> write.
    (
        _tidy_eurostat_tsv(tsv_path, value_column, filter_nuts2=filter_nuts2)
        .rename(columns=column_aliases)
        .loc[:, output_columns]
        .dropna(subset=[value_column])
        .to_parquet(destination, index=False)
    )
    return destination

# Run ingestion
# Both raw exports live in one folder and both parquet outputs in another.
raw_health_dir = PROJECT_ROOT / "data" / "raw" / "health"
processed_dir = PROJECT_ROOT / "data" / "processed"

cod_tsv = raw_health_dir / "hlth_cd_asdr2.tsv"
discharge_tsv = raw_health_dir / "hlth_co_disch1t.tsv"

cod_path = ingest_causes_of_death(
    cod_tsv,
    processed_dir / "health_causes_of_death.parquet",
)
discharge_path = ingest_hospital_discharges(
    discharge_tsv,
    processed_dir / "health_hospital_discharges.parquet",
)

cod_path, discharge_path


(WindowsPath('C:/Users/narek.pirumyan/Desktop/IAE/2025/Big Data/Capstone Project/air-health-eu/data/processed/health_causes_of_death.parquet'),
 WindowsPath('C:/Users/narek.pirumyan/Desktop/IAE/2025/Big Data/Capstone Project/air-health-eu/data/processed/health_hospital_discharges.parquet'))

In [3]:
import pandas as pd

# Read each freshly written parquet file back and keep its first rows as a sanity check.
cod_preview, discharge_preview = (
    pd.read_parquet(parquet_file).head()
    for parquet_file in (cod_path, discharge_path)
)

cod_preview, discharge_preview


(  nuts_id  year frequency unit_code sex age_group icd10_group  \
 0      AT  2011         A        RT   F     TOTAL     A-R_V-Y   
 1     AT1  2011         A        RT   F     TOTAL     A-R_V-Y   
 2    AT11  2011         A        RT   F     TOTAL     A-R_V-Y   
 3    AT12  2011         A        RT   F     TOTAL     A-R_V-Y   
 4    AT13  2011         A        RT   F     TOTAL     A-R_V-Y   
 
    age_standardised_rate_per_100k  
 0                          817.74  
 1                          857.95  
 2                          843.83  
 3                          860.17  
 4                          860.27  ,
   nuts_id  year frequency indicator unit_code sex age_group icd10_group  \
 0      BE  2000         A     INPAT        NR   T     TOTAL       A-T_Z   
 1      BG  2000         A     INPAT        NR   T     TOTAL       A-T_Z   
 2      CZ  2000         A     INPAT        NR   T     TOTAL       A-T_Z   
 3    CZ01  2000         A     INPAT        NR   T     TOTAL       A-T_Z   

In [4]:
# Verify geographic code distribution and column naming
cod_full = pd.read_parquet(cod_path)
discharge_full = pd.read_parquet(discharge_path)

# Same report for both datasets; the second heading carries the blank
# separator line that precedes it in the printed output.
for heading, frame in (
    ("Causes of Death:", cod_full),
    ("\nHospital Discharges:", discharge_full),
):
    has_nuts_id = 'nuts_id' in frame.columns
    print(heading)
    print(f"  Total rows: {len(frame):,}")
    print(f"  Column 'nuts_id' exists: {has_nuts_id}")
    if has_nuts_id:
        print(f"  Unique geographic codes: {frame['nuts_id'].nunique()}")
        print("  Geographic code length distribution:")
        print(f"    {frame['nuts_id'].str.len().value_counts().sort_index().to_dict()}")


Causes of Death:
  Total rows: 3,865,794
  Column 'nuts_id' exists: True
  Unique geographic codes: 491
  Geographic code length distribution:
    {2: 307991, 3: 975046, 4: 2573094, 9: 9663}

Hospital Discharges:
  Total rows: 11,418,846
  Column 'nuts_id' exists: True
  Unique geographic codes: 261
  Geographic code length distribution:
    {2: 1598597, 3: 1795694, 4: 8024555}
