# 01 – Ingest EDGAR Emissions

This notebook converts the EDGAR v8.0 Excel workbook (read from `data/raw/emissions/`) into a tidy, long-format Parquet file, `data/processed/emissions_nuts2.parquet`.

**Key features:**
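- **Preserves all geographic levels**: Keeps every code found in the workbook regardless of length — country (2 chars), NUTS1 (3 chars), NUTS2 (4 chars), and NUTS3 (5 chars). In the EDGAR v8.0 NUTS2 workbook only country-level and NUTS2 codes actually occur (see the length distribution in the last cell).
- **Standardizes codes**: The `NUTS 2` column is trimmed and upper-cased (`.str.strip().str.upper()`) and stored as `nuts_id`; `country_iso` receives the same treatment for consistency.
- **Handles missing values**: Drops rows that have no NUTS code, and rows whose emissions value is null.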
- **Melts wide format**: Converts year columns (Y_1990, Y_1991, etc.) to long format

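**Note**: All geographic levels are preserved for maximum flexibility in the star schema. Filtering to specific NUTS levels can be done during ETL or in queries; to keep only 4-character (NUTS2) codes at ingestion time, pass `filter_nuts2=True` to `ingest_edgar_emissions`.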

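All processing code is contained directly in this notebook — no project-specific Python modules are imported. The only third-party dependency is pandas (together with the Excel and Parquet engines it uses for reading and writing).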


In [1]:
from pathlib import Path
from typing import Dict, Iterable, Optional
import pandas as pd

# Locate the project root: the working directory itself, or its parent when
# the kernel was started from inside the notebooks/ folder.
PROJECT_ROOT = Path.cwd().resolve()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

PROJECT_ROOT


WindowsPath('C:/Users/narek.pirumyan/Desktop/IAE/2025/Big Data/Capstone Project/air-health-eu')

In [2]:
# Configuration
# Workbook sheet name -> short gas label. The keys are the sheets ingested when
# the caller does not name any; the label is handed to `_read_sheet`, which uses
# it to fill the `gas` column wherever a row's `Substance` cell is empty.
EDGAR_SHEETS: Dict[str, str] = {
    "Fossil CO2 AR5": "fossil_co2",
    "CH4_AR5": "ch4",
    "N2O_AR5": "n2o",
    "F-gas AR5": "f_gas",
}

# Value of the workbook's `Sector` column -> coarser `sector_group` label.
# Sectors that are not listed here are labelled "other" by
# `ingest_edgar_emissions`.
DEFAULT_SECTOR_GROUPS: Dict[str, str] = {
    "Agriculture": "agriculture",
    "Buildings": "buildings",
    "Energy": "energy",
    "Industry": "industry",
    "Transport": "transport",
    # NOTE(review): presumably domestic aviation / domestic shipping, folded
    # into transport — confirm against the EDGAR sector definitions.
    "Dom_Avi": "transport",
    "Dom_Ship": "transport",
    "Waste": "waste",
}

def _read_sheet(workbook_path: Path, sheet_name: str, gas_label: str, filter_nuts2: bool = False) -> pd.DataFrame:
    """Load one sheet of the EDGAR workbook and reshape it to long format.

    The wide ``Y_<year>`` columns are melted so that the result holds one row
    per original row and year.

    Parameters
    ----------
    workbook_path : Path
        Location of the EDGAR Excel workbook.
    sheet_name : str
        Name of the worksheet to read.
    gas_label : str
        Fallback written to the ``gas`` column where the sheet's ``Substance``
        cell is empty.
    filter_nuts2 : bool, default False
        When True, keep only rows whose ``nuts_id`` is exactly four characters
        long. By default every geographic code is kept.

    Returns
    -------
    pd.DataFrame
        Columns ``gas``, ``country_iso``, ``country_name``, ``nuts_id``,
        ``nuts_label``, ``sector``, ``year`` (int) and ``emissions_kt_co2e``.
        Rows with a null emissions value are *not* removed here.

    Raises
    ------
    KeyError
        If the sheet lacks one of the expected identifier columns.
    """
    # The first five rows are skipped so that the sixth row supplies the header
    # (presumably title/metadata rows above the table — confirm against the workbook).
    df = pd.read_excel(workbook_path, sheet_name=sheet_name, skiprows=5)

    id_cols = ["Substance", "ISO", "Country", "NUTS 2", "NUTS 2 desc", "Sector"]
    missing = [col for col in id_cols if col not in df.columns]
    if missing:
        # Same exception type pandas would raise, but naming the sheet and columns.
        raise KeyError(f"Sheet {sheet_name!r} is missing expected columns: {missing}")

    # Rows without a geographic code cannot be attributed to any region.
    df = df.dropna(subset=["NUTS 2"])

    # A numeric header cell becomes a non-string column label, so check the
    # type before calling str.startswith.
    value_cols = [col for col in df.columns if isinstance(col, str) and col.startswith("Y_")]

    tidy = df.melt(
        id_vars=id_cols,
        value_vars=value_cols,
        var_name="year",
        value_name="emissions_kt_co2e",
    ).rename(
        columns={
            "Substance": "gas",
            "ISO": "country_iso",
            "Country": "country_name",
            "NUTS 2": "nuts_id",
            "NUTS 2 desc": "nuts_label",
            "Sector": "sector",
        }
    )

    # "Y_1990" -> 1990
    tidy["year"] = tidy["year"].str.replace("Y_", "", regex=False).astype(int)

    # Normalise the codes so later joins are not broken by stray whitespace or case.
    tidy["nuts_id"] = tidy["nuts_id"].str.strip().str.upper()
    tidy["country_iso"] = tidy["country_iso"].str.strip().str.upper()
    tidy["gas"] = tidy["gas"].fillna(gas_label)

    # Optional: restrict to 4-character codes. Missing codes compare as False
    # and are therefore dropped by the filter as well.
    if filter_nuts2:
        tidy = tidy[tidy["nuts_id"].str.len() == 4].copy()

    return tidy

def ingest_edgar_emissions(
    workbook_path: Path,
    output_path: Optional[Path] = None,
    sector_groups: Optional[Dict[str, str]] = None,
    sheets: Optional[Iterable[str]] = None,
    filter_nuts2: bool = False,
) -> Path:
    """Convert the EDGAR workbook into a single tidy parquet file.

    Each requested sheet is read with ``_read_sheet``, the results are
    concatenated, a ``sector_group`` column is derived from ``sector``, rows
    with a null emissions value are dropped, and the frame is written to disk.

    Parameters
    ----------
    workbook_path : Path
        Location of the EDGAR Excel workbook.
    output_path : Path, optional
        Destination parquet file. Defaults to
        ``PROJECT_ROOT / "data" / "processed" / "emissions_nuts2.parquet"``.
        Missing parent directories are created.
    sector_groups : dict, optional
        Mapping from sector name to sector group. ``None`` selects
        ``DEFAULT_SECTOR_GROUPS``; sectors absent from the mapping are
        labelled ``"other"``.
    sheets : iterable of str, optional
        Sheet names to ingest. ``None`` selects every key of ``EDGAR_SHEETS``.
    filter_nuts2 : bool, default False
        Forwarded to ``_read_sheet``; when True only 4-character codes are kept.

    Returns
    -------
    Path
        The path the parquet file was written to.

    Raises
    ------
    ValueError
        If there are no sheets to process.
    """
    workbook_path = Path(workbook_path)
    if output_path is None:
        output_path = PROJECT_ROOT / "data" / "processed" / "emissions_nuts2.parquet"
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Test against None rather than truthiness: an explicitly empty argument
    # must not be silently replaced by the defaults. Materialising the
    # iterable also lets the emptiness check run before any workbook I/O.
    sheet_names = list(EDGAR_SHEETS) if sheets is None else list(sheets)
    if not sheet_names:
        raise ValueError("No EDGAR sheets were processed.")
    mapper = DEFAULT_SECTOR_GROUPS if sector_groups is None else sector_groups

    frames: list[pd.DataFrame] = [
        # Sheets unknown to EDGAR_SHEETS fall back to their own name as gas label.
        _read_sheet(workbook_path, sheet, EDGAR_SHEETS.get(sheet, sheet), filter_nuts2=filter_nuts2)
        for sheet in sheet_names
    ]

    combined = pd.concat(frames, ignore_index=True)
    combined["sector_group"] = combined["sector"].map(mapper).fillna("other")

    # Fix the output column order and drop year cells that carried no value.
    combined = combined[[
        "nuts_id", "nuts_label", "country_iso", "country_name",
        "year", "gas", "sector", "sector_group", "emissions_kt_co2e",
    ]].dropna(subset=["emissions_kt_co2e"])

    combined.to_parquet(output_path, index=False)
    return output_path

# Run ingestion: raw EDGAR workbook in, tidy parquet out.
edgar_xlsx = PROJECT_ROOT.joinpath(
    "data", "raw", "emissions",
    "EDGARv8.0_GHG_by substance_GWP100_AR5_NUTS2_1990_2022.xlsx",
)
output_path = PROJECT_ROOT.joinpath("data", "processed", "emissions_nuts2.parquet")

# The function returns the path it wrote to; keep it for the cells below.
output_path = ingest_edgar_emissions(edgar_xlsx, output_path)
output_path


WindowsPath('C:/Users/narek.pirumyan/Desktop/IAE/2025/Big Data/Capstone Project/air-health-eu/data/processed/emissions_nuts2.parquet')

In [3]:
# Read the file back from disk to confirm the write round-trips.
# (pandas is imported once, in the first code cell.)
preview = pd.read_parquet(output_path)
preview.head()


Unnamed: 0,nuts_id,nuts_label,country_iso,country_name,year,gas,sector,sector_group,emissions_kt_co2e
0,AT,,AUT,Austria,1990,CO2,Dom_Avi,transport,64.581186
1,AT,,AUT,Austria,1990,CO2,Dom_Ship,transport,28.087658
2,AT11,Burgenland,AUT,Austria,1990,CO2,Agriculture,agriculture,0.617001
3,AT11,Burgenland,AUT,Austria,1990,CO2,Buildings,buildings,443.808536
4,AT11,Burgenland,AUT,Austria,1990,CO2,Energy,energy,240.746162


In [4]:
# Verify geographic code distribution
full_data = pd.read_parquet(output_path)
# Code lengths are computed once and reused for the distribution and the
# per-length samples instead of being recomputed on every loop iteration.
code_lengths = full_data["nuts_id"].str.len()

print(f"Total rows: {len(full_data):,}")
print(f"Unique geographic codes: {full_data['nuts_id'].nunique()}")
print("\nGeographic code length distribution:")
print(code_lengths.value_counts().sort_index())
print("\nSample codes by length:")
for length in sorted(code_lengths.unique()):
    # First five codes (alphabetically) among those with this length.
    sample = sorted(full_data.loc[code_lengths == length, "nuts_id"].unique())[:5]
    print(f"  {length} chars: {sample}")


Total rows: 153,830
Unique geographic codes: 274

Geographic code length distribution:
nuts_id
2      6714
4    147116
Name: count, dtype: int64

Sample codes by length:
  2 chars: ['AT', 'BE', 'BG', 'CY', 'CZ']
  4 chars: ['AT11', 'AT12', 'AT13', 'AT21', 'AT22']
