In [6]:
import pandas as pd
from pathlib import Path

# ===============================
# PATHS
# ===============================
# Every location is derived from the directory the script is launched from:
# CSV files are read from ./input and the cleaned copies go to ./output.
BASE_DIR = Path.cwd()
INPUT_DIR, OUTPUT_DIR = BASE_DIR / "input", BASE_DIR / "output"

# Echo the resolved locations so a wrong working directory is easy to spot.
print("BASE_DIR :", BASE_DIR)
print("INPUT_DIR:", INPUT_DIR)
# ===============================
# STANDARD COLUMNS
# ===============================
# Target schema for every output file. normalize_columns() adds any of these
# that are missing (filled with pd.NA) and returns the columns in exactly
# this order; columns not listed here are dropped from the output.
STANDARD_COLUMNS = [
    "periode_data",
    "tanggal",
    "stasiun",
    "pm_sepuluh",
    "pm_duakomalima",
    "sulfur_dioksida",
    "karbon_monoksida",
    "ozon",
    "nitrogen_dioksida",
    "max",
    "parameter_pencemar_kritis",
    "kategori",
    "id"
]

# ===============================
# COLUMN NAME MAP
# ===============================
# Maps column names (as they look after clean_columns(): lower-cased, with
# runs of non-word characters replaced by "_") to the names used in
# STANDARD_COLUMNS. Entries that map a name to itself are no-ops that simply
# list the standard name next to its aliases.
RENAME_MAP = {
    "pm10": "pm_sepuluh",
    "pm_10": "pm_sepuluh",
    "pm_sepuluh": "pm_sepuluh",
    "pm25": "pm_duakomalima",
    "pm_25": "pm_duakomalima",
    "pm_duakomalima": "pm_duakomalima",
    "so2": "sulfur_dioksida",
    "sulfur_dioksida": "sulfur_dioksida",
    "co": "karbon_monoksida",
    "karbon_monoksida": "karbon_monoksida",
    "o3": "ozon",
    "ozon": "ozon",
    "no2": "nitrogen_dioksida",
    "nitrogen_dioksida": "nitrogen_dioksida",
    "critical": "parameter_pencemar_kritis",
    "parameter_pencemar_kritis": "parameter_pencemar_kritis",
    # NOTE(review): "categori" looks like a misspelled header that occurs in
    # some input file - confirm against the data before removing it.
    "categori": "kategori",
    "category": "kategori",
    "kategori": "kategori",
    "lokasi_spku": "stasiun",
    "stasiun": "stasiun",
}


# ===============================
# UTILITIES
# ===============================
def clean_columns(df):
    """Normalise the column labels of *df* in place and return *df*.

    Labels are lower-cased, stripped of surrounding whitespace, and every
    run of non-word characters is collapsed into a single underscore.
    """
    lowered = df.columns.str.lower()
    trimmed = lowered.str.strip()
    # Anything that is not a letter, digit or underscore becomes "_".
    df.columns = trimmed.str.replace(r"[^\w]+", "_", regex=True)
    return df


def _two_digit(values):
    """Return *values* as zero-padded integer strings, leaving gaps as NaN.

    Entries that cannot be read as numbers are coerced to NaN rather than
    raising, so one bad cell does not abort the whole file.
    """
    numeric = pd.to_numeric(values, errors="coerce")
    return numeric.map(lambda v: f"{int(v):02d}", na_action="ignore")


def fix_tanggal(df):
    """Return a copy of *df* whose ``tanggal`` column holds datetimes.

    Two input layouts are supported:

    - With a ``bulan`` column: ``tanggal`` holds the day of the month. The
      date is assembled from the first four characters of ``periode_data``
      (the year), ``bulan`` and ``tanggal``; ``bulan`` is dropped afterwards.
    - Without a ``bulan`` column: ``tanggal`` is parsed directly as a date.

    Values that cannot be turned into a valid date (missing or non-numeric
    month/day, impossible dates, unparseable strings) become ``NaT``.

    The input frame is not modified; a new DataFrame is returned.

    Raises:
        KeyError: if ``tanggal`` is missing, or if ``bulan`` is present but
            ``periode_data`` is not.
    """
    if "bulan" not in df.columns:
        return df.assign(tanggal=pd.to_datetime(df["tanggal"], errors="coerce"))

    year = df["periode_data"].astype(str).str[:4]
    # A NaN month or day propagates through the concatenation, so the row
    # ends up as NaT instead of crashing an integer cast.
    iso_dates = year + "-" + _two_digit(df["bulan"]) + "-" + _two_digit(df["tanggal"])
    tanggal = pd.to_datetime(iso_dates, format="%Y-%m-%d", errors="coerce")

    return df.assign(tanggal=tanggal).drop(columns=["bulan"])


def normalize_columns(df):
    """Bring *df* into the standard schema and return the result.

    Column labels are cleaned and renamed via RENAME_MAP, the ``tanggal``
    column is converted by fix_tanggal(), any standard column still missing
    is added filled with ``pd.NA``, and only STANDARD_COLUMNS are returned,
    in that order.
    """
    renamed = clean_columns(df).rename(columns=RENAME_MAP)
    result = fix_tanggal(renamed)

    absent = [name for name in STANDARD_COLUMNS if name not in result.columns]
    for name in absent:
        result[name] = pd.NA

    return result[STANDARD_COLUMNS]


# ===============================
# LOOP PER FILE
# ===============================
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first file is written.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Normalise every CSV in the input folder and write it to the output folder
# under the same file name; sorted() keeps the processing order stable.
for file in sorted(INPUT_DIR.glob("*.csv")):
    df = pd.read_csv(file)
    df_clean = normalize_columns(df)

    output_file = OUTPUT_DIR / file.name
    df_clean.to_csv(output_file, index=False)

    print(f"✔ saved: {output_file.name}")


BASE_DIR : c:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy
INPUT_DIR: c:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input
✔ saved: 2010_id.csv
✔ saved: 2011_id.csv
✔ saved: 2012_id.csv
✔ saved: 2013_id.csv
✔ saved: 2014_id.csv
✔ saved: 2015_id.csv
✔ saved: 2016_id.csv
✔ saved: 2017_id.csv
✔ saved: 2018_id.csv
✔ saved: 2019_id.csv
✔ saved: 2020_id.csv
✔ saved: 2021_id.csv
✔ saved: 2022_id.csv
✔ saved: 2023_id.csv
✔ saved: 2024_id.csv
✔ saved: 2025_id.csv
