In [5]:
import os
import re
import glob
import csv
import pandas as pd
from io import StringIO

# === CONFIG ===
# Folder scanned by main() for per-country input files ("*.csv"),
# resolved relative to the current working directory.
# NOTE(review): the paths below are named "temperature" while the parsing
# code and log messages in this file refer to precipitation (PRECTOT*) —
# confirm which dataset this script is meant to process.
INPUT_DIR = "temperature_csv"

# Combined long-format output written by main() (columns: Country, Year, ANN).
OUTPUT_LONG = "temperature_annual_long.csv"

# Reference table passed to load_canonical_names(); its Area/Country column
# supplies the canonical spelling of every country name.
REF_COUNTRIES_PATH = "coordinates_countries_full_209.csv"
# ==============

def load_canonical_names(path):
    """
    Load canonical country names from the coordinates reference file.

    The file is read with pandas (CSV when the extension is '.csv', Excel
    otherwise) and must contain one of these columns, matched
    case-insensitively and in this order of preference:
    'Area', 'Country', 'Country name', 'Name'.

    Blank cells are skipped (they would otherwise be registered as a country
    literally called "nan"), and so are names that contain no a-z letters,
    because they all normalize to the same empty key.

    Args:
        path: path to the reference CSV/Excel file.

    Returns:
        mapping: dict normalized_name -> canonical_name, where the normalized
        name is the lower-cased name with everything except a-z removed.
        An empty dict is returned (with a warning printed) when the file is
        missing or no suitable column is found.
    """
    if not os.path.exists(path):
        print(f"[WARN] Reference countries file not found: {path}")
        return {}

    ext = os.path.splitext(path)[1].lower()
    ref = pd.read_csv(path) if ext == ".csv" else pd.read_excel(path)

    # Detect the canonical name column; str() guards against non-string labels.
    cols_lower = {str(c).lower().strip(): c for c in ref.columns}
    canon_col = next(
        (cols_lower[key]
         for key in ("area", "country", "country name", "name")
         if key in cols_lower),
        None,
    )

    if canon_col is None:
        print("[WARN] Could not find a canonical country column (Area/Country) in reference file.")
        print("       Columns found:", list(ref.columns))
        return {}

    def norm(s):
        return re.sub(r'[^a-z]', '', s.lower())

    mapping = {}
    # dropna() first: astype(str) would turn missing cells into the text "nan".
    for name in ref[canon_col].dropna().astype(str):
        key = norm(name)
        if key:  # an empty key cannot identify a country
            mapping[key] = name

    print(f"[INFO] Loaded {len(mapping)} canonical country names from '{canon_col}' in {path}")
    return mapping

# Built once at import time. An empty dict (reference file missing or without
# a usable name column) makes align_to_canonical() return the cleaned name only.
CANON_MAP = load_canonical_names(REF_COUNTRIES_PATH)

def align_to_canonical(name):
    """
    Map a raw country label onto its canonical spelling.

    Underscores become spaces and surrounding whitespace is trimmed. The
    result is then looked up in CANON_MAP using a key made of its lower-cased
    a-z letters only, so case, spacing and punctuation do not matter.

    Returns the canonical name from CANON_MAP when the key is present;
    otherwise (or when CANON_MAP is empty) the cleaned label itself.
    """
    display_name = name.replace('_', ' ').strip()

    if CANON_MAP:
        lookup_key = re.sub(r'[^a-z]', '', display_name.lower())
        if lookup_key in CANON_MAP:
            return CANON_MAP[lookup_key]

    # No mapping loaded, or no canonical entry for this label.
    return display_name

def country_from_filename(path):
    """
    Derive the canonical country name from a data file path.

    Expected file names look like:
      'Viet_Nam_15.0_100.0_1990_2013.csv'
      'Albania_41.33_19.82_1990_2013.csv'
      'Côte_d'Ivoire_7.54_-5.55_1980_2023.csv'

    The country part is the shortest prefix followed by two underscore-
    separated signed numbers (latitude, longitude); anything after them is
    ignored. If the name does not follow that layout, the whole stem is used.
    Either way the result goes through align_to_canonical().
    """
    stem, _ext = os.path.splitext(os.path.basename(path))

    # <country>_<lat>_<lon>[_anything]; the country group is non-greedy so it
    # stops at the first pair of numeric tokens.
    layout = r'^(?P<country>.+?)_(?:-?\d+(?:\.\d+)?)_(?:-?\d+(?:\.\d+)?)(?:_.+)?$'
    found = re.match(layout, stem)

    raw_name = found.group('country') if found else stem
    return align_to_canonical(raw_name)

def sniff_delimiter(path, sample_lines=20):
    """
    Detect the field delimiter of a text file; default to comma.

    Up to `sample_lines` lines are read from the start of the file and handed
    to csv.Sniffer, restricted to the candidates ',', ';', tab and '|'.

    Args:
        path: file to inspect.
        sample_lines: maximum number of leading lines to sample.

    Returns:
        The detected delimiter, or ',' when the file cannot be read or the
        sniffer cannot decide (e.g. an empty file).
    """
    from itertools import islice

    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            # islice stops quietly at EOF; calling next(f) a fixed number of
            # times raised StopIteration on short files and forced the
            # comma fallback regardless of the real delimiter.
            sample = "".join(islice(f, sample_lines))
        dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
        return dialect.delimiter
    except (OSError, csv.Error):
        return ","

def find_header_and_format(text):
    """
    Find the actual tabular header in a POWER CSV blob and return:
      (format_type, header_idx, lines)

    A line is only considered a header when the expected names appear as
    whole column tokens (split on ',', ';', tab or '|', quotes and
    whitespace stripped, case-insensitive). Matching on tokens instead of
    raw substrings keeps free-text preamble lines such as
    "Monthly and annual means by year" from being mistaken for the header.

    format_type:
        'monthly' if the line has YEAR + MO/MONTH + a PRECTOT* column
        'annual'  if the line has YEAR + ANN, or YEAR + a PRECTOT* column

    The first monthly header wins outright; otherwise the first annual-style
    header is used.

    Args:
        text: full file content as a single string (any newline style).

    Returns:
        (format_type, header_idx, lines) where `lines` is `text` split on
        normalized newlines and `header_idx` indexes the header in `lines`.

    Raises:
        ValueError: if no line qualifies as a header.
    """
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    annual_idx = None

    for i, line in enumerate(lines):
        cols = {
            tok.strip().strip("\"'").strip().upper()
            for tok in re.split(r'[,;\t|]', line)
        }
        if "YEAR" not in cols:
            continue

        has_prectot = any(c.startswith("PRECTOT") for c in cols)

        # Monthly layout takes precedence over any annual candidate seen so far.
        if has_prectot and ("MO" in cols or "MONTH" in cols):
            return "monthly", i, lines

        # Remember only the first annual-style candidate as the fallback.
        if annual_idx is None and ("ANN" in cols or has_prectot):
            annual_idx = i

    if annual_idx is None:
        raise ValueError("Could not locate tabular header containing YEAR + (MO/MONTH or ANN/PRECTOT).")
    return "annual", annual_idx, lines

# Value NASA POWER writes for data that is missing or cannot be computed.
_POWER_FILL_VALUE = -999


def _first_present(columns, candidates):
    """Return the first name in `candidates` that is in `columns`, else None."""
    return next((c for c in candidates if c in columns), None)


def _clean_year_value(df, year_col, value_col):
    """
    Return a copy of df[[year_col, value_col]] with both columns numeric.

    Rows are dropped when the year or the value is not numeric, or when the
    value equals the POWER fill value; the year column is cast to int.
    """
    sub = df[[year_col, value_col]].copy()
    sub[year_col] = pd.to_numeric(sub[year_col], errors="coerce")
    sub[value_col] = pd.to_numeric(sub[value_col], errors="coerce")

    usable = sub[year_col].notna() & sub[value_col].notna()
    # The fill value is a "no data" marker, not a measurement: keeping it
    # would subtract 999 from a yearly sum or report -999 as the annual value.
    usable &= sub[value_col] != _POWER_FILL_VALUE

    sub = sub.loc[usable].copy()
    sub[year_col] = sub[year_col].astype(int)
    return sub


def parse_monthly_to_annual(text):
    """
    Parse a POWER CSV (monthly or annual) and return a DataFrame:
        Year | ANN

    The header row is located with find_header_and_format(); everything from
    that row on is read with pandas and column names are upper-cased.

    * monthly layout: the first available of PRECTOTCORR_SUM, PRECTOTCORR,
      PRECTOT_SUM, PRECTOT is summed per YEAR and reported as ANN.
    * annual layout: the ANN column (or PRECTOTCORR / PRECTOT when ANN is
      absent) is taken as-is, one output row per input row.

    Non-numeric entries and POWER fill values (-999) are treated as missing
    and dropped. In the monthly layout this means a year with missing months
    is summed over the months that are present.

    Args:
        text: full file content as a single string.

    Returns:
        DataFrame with columns 'Year' (int) and 'ANN'.

    Raises:
        ValueError: if no header is found, or the required YEAR / month /
            value columns are missing for the detected layout.
    """
    fmt, header_idx, lines = find_header_and_format(text)
    df = pd.read_csv(StringIO("\n".join(lines[header_idx:])))

    # Normalize columns
    df.columns = [str(c).strip().upper() for c in df.columns]

    if fmt == "monthly":
        value_col = _first_present(
            df.columns,
            ["PRECTOTCORR_SUM", "PRECTOTCORR", "PRECTOT_SUM", "PRECTOT"],
        )
        if value_col is None:
            raise ValueError(f"Could not find monthly precipitation column in: {list(df.columns)}")

        month_col = _first_present(df.columns, ["MO", "MONTH"])
        if ("YEAR" not in df.columns) or (month_col is None):
            raise ValueError("Missing YEAR/MO columns in monthly data.")

        sub = _clean_year_value(df, "YEAR", value_col)
        ann = sub.groupby("YEAR", as_index=False)[value_col].sum()
        # use 'ANN' for consistency with the annual layout
        return ann.rename(columns={"YEAR": "Year", value_col: "ANN"})

    if fmt == "annual":
        value_col = _first_present(df.columns, ["ANN", "PRECTOTCORR", "PRECTOT"])
        if value_col is None:
            raise ValueError(f"Annual format detected but no ANN/PRECTOT column found in: {list(df.columns)}")

        if "YEAR" not in df.columns:
            raise ValueError("Annual format detected but missing YEAR column.")

        sub = _clean_year_value(df, "YEAR", value_col)
        return sub.rename(columns={"YEAR": "Year", value_col: "ANN"})

    raise ValueError(f"Unknown format: {fmt}")

def load_year_ann_precip(path):
    """
    Load one input CSV and return its yearly values tagged with the country.

    The file is read as UTF-8 (undecodable bytes are ignored), parsed with
    parse_monthly_to_annual(), and labelled with the name that
    country_from_filename() derives from the file name.

    Returns:
        DataFrame with columns Country | Year | ANN, in that order.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        raw_text = handle.read()

    yearly = parse_monthly_to_annual(raw_text)
    # Same label on every row: the canonical name taken from the file name.
    yearly["Country"] = country_from_filename(path)

    return yearly.loc[:, ["Country", "Year", "ANN"]]

def main():
    """
    Combine every CSV in INPUT_DIR into one long-format file.

    Each file is loaded with load_year_ann_precip(); a file that raises is
    reported on stdout and skipped, and empty results are ignored. The
    remaining frames are concatenated, sorted by Country then Year, and
    written to OUTPUT_LONG (UTF-8 with BOM), followed by a short summary.
    Nothing is written when no input files exist or none yields data.
    """
    csv_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.csv")))
    if not csv_files:
        print(f"No CSV files found in: {INPUT_DIR}")
        return

    frames = []
    for fpath in csv_files:
        try:
            frame = load_year_ann_precip(fpath)
            if frame is None or frame.empty:
                continue
            frames.append(frame)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"[Error] {os.path.basename(fpath)} -> {e}")

    if not frames:
        print("No data combined — please check file formats.")
        return

    combined = (
        pd.concat(frames, ignore_index=True)
        .sort_values(["Country", "Year"])
        .reset_index(drop=True)
    )

    # Save long format only
    # NOTE(review): the messages say "precipitation" while OUTPUT_LONG is
    # named "temperature_annual_long.csv" — confirm which wording is intended.
    combined.to_csv(OUTPUT_LONG, index=False, encoding="utf-8-sig")
    print(f"[OK] Saved precipitation long format to {OUTPUT_LONG}")
    print(f"Rows: {len(combined):,} | Countries: {combined['Country'].nunique()} | Years: {combined['Year'].min()}–{combined['Year'].max()}")


if __name__ == "__main__":
    main()


[INFO] Loaded 209 canonical country names from 'Area' in coordinates_countries_full_209.csv
[OK] Saved precipitation long format to temperature_annual_long.csv
Rows: 8,987 | Countries: 209 | Years: 1981–2023
