In [1]:
import os
import re
import glob
import csv
import pandas as pd
from io import StringIO

# === CONFIG ===
INPUT_DIR = "solar_radiation_csv"
OUTPUT_LONG = "solar_annual_long.csv"
REF_COUNTRIES_PATH = "coordinates_countries_full_209.csv"
# ==============

def load_canonical_names(path):
    if not os.path.exists(path):
        print(f"[WARN] Reference countries file not found: {path}")
        return {}

    ext = os.path.splitext(path)[1].lower()
    if ext == ".csv":
        ref = pd.read_csv(path)
    else:
        ref = pd.read_excel(path)

    cols_lower = {c.lower().strip(): c for c in ref.columns}
    canon_col = None
    for key in ("area", "country", "country name", "name"):
        if key in cols_lower:
            canon_col = cols_lower[key]
            break

    if canon_col is None:
        print("[WARN] Could not find a canonical country column.")
        return {}

    canon_series = ref[canon_col].astype(str)

    def norm(s): return re.sub(r'[^a-z]', '', s.lower())

    mapping = {norm(name): name for name in canon_series}
    print(f"[INFO] Loaded {len(mapping)} canonical names.")
    return mapping

CANON_MAP = load_canonical_names(REF_COUNTRIES_PATH)

def align_to_canonical(name):
    cleaned = name.replace("_", " ").strip()
    if not CANON_MAP: return cleaned
    def norm(s): return re.sub(r"[^a-z]", "", s.lower())
    return CANON_MAP.get(norm(cleaned), cleaned)

def country_from_filename(path):
    base = os.path.basename(path)
    stem = os.path.splitext(base)[0]
    m = re.match(r"^(?P<country>.+?)_(?:-?\d+(?:\.\d+)?)_", stem)
    if m: return align_to_canonical(m.group("country"))
    return align_to_canonical(stem)

def find_header_and_format(text):
    lines = text.replace("\r", "\n").split("\n")
    header_idx = None
    fmt = None

    for i, line in enumerate(lines):
        u = line.upper()
        if ("YEAR" in u) and ("MO" in u or "MONTH" in u) and ("ALLSKY" in u):
            header_idx = i; fmt = "monthly"; break
        if ("YEAR" in u) and ("ALLSKY" in u or "ANN" in u):
            if header_idx is None:
                header_idx = i; fmt = "annual"

    if header_idx is None:
        raise ValueError("No header found for solar radiation CSV.")

    return fmt, header_idx, lines

def parse_monthly_to_annual(text):
    fmt, idx, lines = find_header_and_format(text)
    df = pd.read_csv(StringIO("\n".join(lines[idx:])))
    df.columns = [c.upper() for c in df.columns]

    if fmt == "monthly":
        value_col = None
        for c in df.columns:
            if c.startswith("ALLSKY_SFC_SW_DWN"):
                value_col = c; break
        if value_col is None:
            raise ValueError("Solar value column not found.")

        df["YEAR"] = pd.to_numeric(df["YEAR"], errors="coerce")
        df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
        df = df.dropna(subset=["YEAR", value_col])
        df["YEAR"] = df["YEAR"].astype(int)

        ann = df.groupby("YEAR")[value_col].sum().reset_index()
        ann = ann.rename(columns={"YEAR": "Year", value_col: "ANN"})
        return ann

    else:  # annual
        value_col = None
        for c in df.columns:
            if c.startswith("ALLSKY"):
                value_col = c; break
        if value_col is None:
            raise ValueError("Annual solar data missing ALLSKY column.")

        df["YEAR"] = pd.to_numeric(df["YEAR"], errors="coerce")
        df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
        df = df.dropna()
        return df.rename(columns={"YEAR": "Year", value_col: "ANN"})[["Year", "ANN"]]

def load_year_ann(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    ann = parse_monthly_to_annual(text)
    country = country_from_filename(path)
    ann["Country"] = country
    return ann[["Country", "Year", "ANN"]]

def main():
    files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.csv")))
    parts = []
    for fp in files:
        try:
            parts.append(load_year_ann(fp))
        except Exception as e:
            print(f"[Error {fp}] {e}")

    combined = pd.concat(parts)
    combined = combined.sort_values(["Country", "Year"]).reset_index(drop=True)
    combined.to_csv(OUTPUT_LONG, index=False, encoding="utf-8-sig")

    print(f"[OK] Saved solar radiation long format → {OUTPUT_LONG}")
    print(f"{len(combined)} rows, {combined['Country'].nunique()} countries")

if __name__ == "__main__":
    main()


[INFO] Loaded 209 canonical names.
[Error solar_radiation_csv\Afghanistan_34.5300_69.1700_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Albania_41.3300_19.8200_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Algeria_28.0300_1.6600_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Angola_-11.2000_17.8700_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Antigua_and_Barbuda_17.1200_-61.8500_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Argentina_-38.4200_-63.6200_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Armenia_40.0700_45.0400_ALLSKY_SFC_SW_DWN_1981_2023.csv] Annual solar data missing ALLSKY column.
[Error solar_radiation_csv\Australia_-25.2700_133.7800_ALLSKY_SFC_SW_D

ValueError: No objects to concatenate