In [1]:
import pandas as pd
import numpy as np
from glob import glob
import os

In [None]:
# ============================
# 1. Helper: process one city
# ============================

def build_monthly_from_ytd(pattern, city_name, country_name):
    """Build a monthly mean PM2.5 series for one city from its YTD CSV files.

    Every file matching ``pattern`` is read, its date and raw-concentration
    columns are located by case-insensitive substring match, and invalid
    readings are discarded. The cleaned readings of *all* files are pooled
    before the monthly mean is taken, so a month that appears in more than
    one file is weighted by its number of readings (not by the number of
    files it appears in). Rows are pooled as-is; repeated timestamps are not
    de-duplicated.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the CSV files of one city.
    city_name : str
        Value written to the ``city`` column of the result.
    country_name : str
        Value written to the ``country`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``city``, ``date`` (first day of the month) and
        ``pm25`` (mean of the valid readings in that month), one row per
        month, ordered by ``date``.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    ValueError
        If a file has no column containing "date" or no column containing
        "raw conc" (after lower-casing and stripping the header names).
    """
    # Sorted so the processing order does not depend on the file system.
    files = sorted(glob(pattern))
    if not files:
        raise FileNotFoundError(f"No files found for pattern: {pattern}")

    readings = []

    for f in files:
        df = pd.read_csv(f)

        # Normalise header names so the substring lookups are case-insensitive.
        df = df.rename(columns=lambda c: str(c).lower().strip())

        # Date column is typically "Date (LT)" in PAQI files -> "date (lt)".
        date_col = next((c for c in df.columns if "date" in c), None)
        if date_col is None:
            raise ValueError(f"No date-like column found in {f}")

        # PM2.5 concentration column is "Raw Conc." -> "raw conc."
        pm_col = next((c for c in df.columns if "raw conc" in c), None)
        if pm_col is None:
            raise ValueError(f"No 'Raw Conc.' column found in {f}")

        cleaned = pd.DataFrame({
            "date": pd.to_datetime(df[date_col], errors="coerce"),
            "pm25": pd.to_numeric(df[pm_col], errors="coerce"),
        })

        # Keep readings in [0, 1000]; negatives (invalid codes), implausibly
        # large values and unparseable entries (NaN) all fail this test.
        cleaned = cleaned[cleaned["pm25"].between(0, 1000)]
        cleaned = cleaned.dropna(subset=["date"])

        readings.append(cleaned)

    # Pool the individual readings first, then average once per month.
    # Averaging per-file monthly means instead would let a file holding a
    # single reading for a month count as much as a file holding hundreds.
    pooled = pd.concat(readings, ignore_index=True)

    # Collapse each timestamp to the first day of its month.
    pooled["date"] = pooled["date"].dt.to_period("M").dt.to_timestamp()

    city_monthly = pooled.groupby("date", as_index=False)["pm25"].mean()

    # Resulting column order: country, city, date, pm25.
    city_monthly.insert(0, "city", city_name)
    city_monthly.insert(0, "country", country_name)

    return city_monthly

In [None]:
# ----------------------------------------------
# 2. Monthly series for the new donor city
# ----------------------------------------------

# Folder that holds the donor-city CSV files.
donor_folder = "pm25_donors"

# Glob pattern for the Baghdad YTD files inside the donor folder.
baghdad_pattern = os.path.join(donor_folder, "Baghdad_PM2.5_*_YTD.csv")

baghdad_monthly = build_monthly_from_ytd(baghdad_pattern, "Baghdad", "Iraq")

# Kept as a concat over a list so the result is a fresh frame with a clean index.
donors_new = pd.concat([baghdad_monthly], ignore_index=True)

# Quick look at the result.
print("Baghdad monthly head:", baghdad_monthly.head(), sep="\n")
print("New donors shape:", donors_new.shape)

  df["date"] = pd.to_datetime(df[date_col], errors="coerce")


Baghdad monthly head:
  country     city       date       pm25
0    Iraq  Baghdad 2019-02-01  34.173554
1    Iraq  Baghdad 2019-03-01  29.316076
2    Iraq  Baghdad 2019-04-01  21.985975
3    Iraq  Baghdad 2019-05-01  33.347945
4    Iraq  Baghdad 2019-06-01  35.779494
New donors shape: (52, 4)


In [4]:
# ----------------------------------------------
# 3. Merge the new donors into the saved panel
# ----------------------------------------------

# NOTE: the output path equals the input path, so the panel file is
# overwritten by this cell.
panel_path = "pm25_panel.csv"
out_path = "pm25_panel.csv"

# Load the panel, parse its dates (unparseable entries become NaT) and keep
# only the four columns shared with donors_new.
panel = (
    pd.read_csv(panel_path)
    .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
    [["country", "city", "date", "pm25"]]
)

# Stack panel and donors; a (country, city, date) key present more than once
# is collapsed to the mean of its pm25 values, then rows are ordered by key.
panel_extended = (
    pd.concat([panel, donors_new], ignore_index=True)
    .groupby(["country", "city", "date"], as_index=False)["pm25"]
    .mean()
    .sort_values(["country", "city", "date"])
)

panel_extended.to_csv(out_path, index=False)

print("Saved extended panel to:", out_path)
print(panel_extended.head(), panel_extended.tail(), sep="\n")

Saved extended panel to: pm25_panel.csv
      country   city       date        pm25
0  Bangladesh  Dhaka 2016-03-01  108.788410
1  Bangladesh  Dhaka 2016-04-01   49.787204
2  Bangladesh  Dhaka 2016-05-01   51.318548
3  Bangladesh  Dhaka 2016-06-01   39.015406
4  Bangladesh  Dhaka 2016-07-01   29.364738
        country      city       date       pm25
324  Tajikistan  Dushanbe 2023-09-01  63.004918
325  Tajikistan  Dushanbe 2023-10-01  43.313669
326  Tajikistan  Dushanbe 2023-11-01  46.177778
327  Tajikistan  Dushanbe 2023-12-01  79.358008
328  Tajikistan  Dushanbe 2024-01-01  36.000000
