In [9]:
import pandas as pd
from pathlib import Path
import re

# Folder holding the raw ISPU CSV exports (machine-specific absolute path).
FOLDER = Path(r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ISPU")

# Destination folder for the processed CSV; created if it does not exist yet.
OUTPUT_FOLDER = Path(
    r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input"
)
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Common file-name prefix of the ISPU exports picked up by the glob below.
PREFIX = "data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta"

# The only file year handled by this cell; every date is forced into this year.
TARGET_YEAR = 2023


def extract_station_code(stasiun):
    """Return one station code per row of ``stasiun``.

    The code is the first ``DKI<digits>`` match in the station name. Rows
    without such a match fall back to the first whitespace-separated token
    of their own station name.

    Parameters
    ----------
    stasiun : pandas.Series
        Station names; values are converted with ``astype(str)`` first.

    Returns
    -------
    pandas.Series
        Station codes, aligned on the index of ``stasiun``.
    """
    names = stasiun.astype(str)
    # expand=False yields a Series. With the default (a one-column DataFrame)
    # ``fillna(Series)`` is interpreted as a column -> fill-value mapping, so
    # every missing code would receive the value of the row labelled 0.
    codes = names.str.extract(r"(DKI\d+)", expand=False)
    return codes.fillna(names.str.split().str[0])


def force_year(timestamp, year):
    """Return ``timestamp`` moved to ``year``, or ``NaT`` if that date does not exist.

    ``Timestamp.replace`` raises ``ValueError`` for 29 February when ``year``
    is not a leap year; that case is mapped to ``NaT`` instead of aborting
    the whole cell.
    """
    try:
        return timestamp.replace(year=year)
    except ValueError:
        return pd.NaT


for file in FOLDER.glob(f"{PREFIX}*.csv"):
    print(f"\nüìÑ Processing: {file.name}")

    # ===============================
    # Take the year from the file name
    # ===============================
    match = re.search(r"(20\d{2})", file.name)
    if not match:
        print("‚ö†Ô∏è Tahun tidak ditemukan, skip")
        continue

    file_year = int(match.group(1))

    # Only the target-year file is processed here.
    if file_year != TARGET_YEAR:
        print(f"‚è≠Ô∏è Bukan {TARGET_YEAR}, dilewati")
        continue

    print("‚û°Ô∏è Tahun file:", file_year)

    df = pd.read_csv(file)

    # ===============================
    # Parse the date column as-is (unparseable values become NaT)
    # ===============================
    df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")

    # ===============================
    # Check the year of every parsed date
    # ===============================
    tahun_asli = df["tanggal"].dt.year
    mask_mismatch = (tahun_asli.notna()) & (tahun_asli != TARGET_YEAR)

    if mask_mismatch.any():
        print(f"‚ö†Ô∏è WARNING: Tahun tanggal ‚â† {TARGET_YEAR}")
        print("Contoh:")
        print(df.loc[mask_mismatch, "tanggal"].head())
        print("Jumlah mismatch:", mask_mismatch.sum())

        # Force the mismatching dates into the target year (month and day
        # are kept; a non-existent 29 Feb becomes NaT).
        df.loc[mask_mismatch, "tanggal"] = (
            df.loc[mask_mismatch, "tanggal"]
            .apply(lambda ts: force_year(ts, TARGET_YEAR))
        )

    # ===============================
    # Keep periode_data (YYYYMM) in sync with the corrected dates
    # ===============================
    if "periode_data" in df.columns:
        df["periode_data"] = df["tanggal"].dt.strftime("%Y%m")

    # ===============================
    # Station code
    # ===============================
    df["kode_stasiun"] = extract_station_code(df["stasiun"])

    # ===============================
    # Row id: "<YYYY-MM-DD>_<station code>" (missing when the date is NaT)
    # ===============================
    df["id"] = (
        df["tanggal"].dt.strftime("%Y-%m-%d")
        + "_"
        + df["kode_stasiun"]
    )

    # ===============================
    # Save
    # NOTE(review): the output name is fixed, so a second matching file for
    # the same year would overwrite the first one.
    # ===============================
    output_path = OUTPUT_FOLDER / f"{TARGET_YEAR}_fixed_id.csv"
    df.to_csv(output_path, index=False)

    print(f"‚úÖ Saved: {output_path}")



üìÑ Processing: data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv
‚û°Ô∏è Tahun file: 2023
Contoh:
192   2022-12-01
193   2022-12-02
194   2022-12-03
195   2022-12-04
196   2022-12-05
Name: tanggal, dtype: datetime64[ns]
Jumlah mismatch: 155
‚úÖ Saved: C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input\2023_fixed_id.csv

üìÑ Processing: data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv
‚è≠Ô∏è Bukan 2023, dilewati

üìÑ Processing: data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv
‚è≠Ô∏è Bukan 2023, dilewati


In [None]:
import pandas as pd
from pathlib import Path
import re

# Folder holding the raw ISPU CSV exports. This must be the directory, not a
# single CSV file: globbing on a file path yields nothing, so the loop below
# would silently do no work.
FOLDER = Path(r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ISPU")
OUTPUT_FOLDER = Path(r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input")
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Common file-name prefix of the ISPU exports picked up by the glob below.
PREFIX = "data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta"


def compose_date(periode_data, day_of_month):
    """Build full dates from a period column and a day column.

    The first four characters of ``periode_data`` are used as the year and
    characters 5-6 as the month; ``day_of_month`` is zero-padded to two
    characters. Combinations that do not form a valid date become ``NaT``.

    Parameters
    ----------
    periode_data : pandas.Series
        Period values; converted with ``astype(str)`` before slicing.
    day_of_month : pandas.Series
        Day values; converted with ``astype(str)`` before padding.

    Returns
    -------
    pandas.Series
        Parsed datetimes aligned on the input index.
    """
    periode = periode_data.astype(str)
    day = day_of_month.astype(str).str.zfill(2)
    return pd.to_datetime(
        periode.str[:4] + "-" + periode.str[4:6] + "-" + day,
        errors="coerce",
    )


for file in FOLDER.glob(f"{PREFIX}*.csv"):
    print(f"\nüìÑ Processing: {file.name}")

    # Take the year from the file name.
    match = re.search(r"(20\d{2})", file.name)
    if not match:
        print("‚ö†Ô∏è Tahun tidak ditemukan, skip")
        continue

    # Kept as a string: it is only compared with strings and used in the
    # output file name.
    file_year = match.group(1)

    # Only the 2024 and 2025 files are handled by this cell.
    if file_year not in ["2024", "2025"]:
        print("‚è≠Ô∏è Bukan 2024/2025, dilewati")
        continue

    df = pd.read_csv(file)

    # In these files "tanggal" is combined with "periode_data" to obtain a
    # full date, which then replaces the original "tanggal" column.
    df["tanggal"] = compose_date(df["periode_data"], df["tanggal"])

    # expand=False returns a Series (one code per row) instead of a
    # one-column DataFrame; rows without a DKI<digits> match stay missing.
    df["kode_stasiun"] = df["stasiun"].str.extract(r"(DKI\d+)", expand=False)

    # Row id: "<YYYY-MM-DD>_<station code>"; missing if either part is missing.
    df["id"] = (
        df["tanggal"].dt.strftime("%Y-%m-%d")
        + "_"
        + df["kode_stasiun"]
    )

    output_path = OUTPUT_FOLDER / f"{file_year}_id.csv"
    df.to_csv(output_path, index=False)

    print(f"‚úÖ Saved: {output_path}")



üìÑ Processing: data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-2023-komponen-data.csv
‚è≠Ô∏è Bukan 2024/2025, dilewati

üìÑ Processing: data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv
‚úÖ Saved: C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input\2024_id.csv

üìÑ Processing: data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv
‚úÖ Saved: C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input\2025_id.csv


In [16]:
import pandas as pd

from pathlib import Path

# Destination folder for the processed CSV; created if it does not exist yet.
OUTPUT_FOLDER = Path(
    r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input"
)

OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Raw 2024 ISPU export (machine-specific absolute path).
df_2024 = pd.read_csv(
    r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ISPU\data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2024.csv"
)

# Split "periode_data" into year (first four characters) and month
# (characters 5-6), and zero-pad "tanggal" so the three parts can be joined
# into a date string. Note that "tanggal" itself is overwritten with the
# padded string.
df_2024["tahun"] = df_2024["periode_data"].astype(str).str[:4]
df_2024["bulan"] = df_2024["periode_data"].astype(str).str[4:6]
df_2024["tanggal"] = df_2024["tanggal"].astype(str).str.zfill(2)

# Combinations that do not form a valid date become NaT.
df_2024["tanggal_full"] = pd.to_datetime(
    df_2024["tahun"] + "-" + df_2024["bulan"] + "-" + df_2024["tanggal"],
    errors="coerce"
)

# expand=False returns a Series (one code per row) instead of a one-column
# DataFrame; rows without a DKI<digits> match stay missing.
df_2024["kode_stasiun"] = df_2024["stasiun"].str.extract(r"(DKI\d+)", expand=False)

# Row id: "<YYYY-MM-DD>_<station code>"; missing if either part is missing.
df_2024["id"] = (
    df_2024["tanggal_full"].dt.strftime("%Y-%m-%d")
    + "_"
    + df_2024["kode_stasiun"]
)

# NOTE(review): this also writes a copy into the current working directory;
# confirm whether that extra file is still needed.
df_2024.to_csv("2024_id.csv", index=False)

# NOTE(review): the 2024/2025 loop cell earlier in this notebook writes the
# same OUTPUT_FOLDER / "2024_id.csv" with a different column layout (there
# "tanggal" holds the full date and no tahun/bulan/tanggal_full columns are
# added); whichever cell runs last wins.
df_2024.to_csv(OUTPUT_FOLDER / "2024_id.csv", index=False)

# Preview goes last: only the final expression of a cell is displayed.
df_2024[["tanggal", "stasiun", "id"]].head()




In [17]:
import pandas as pd

from pathlib import Path

# Destination folder for the processed CSV; created if it does not exist yet.
OUTPUT_FOLDER = Path(
    r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ispu_named copy\input"
)

OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Raw 2025 ISPU export (machine-specific absolute path).
df_2025 = pd.read_csv(
    r"C:\Users\USER\Desktop\DATAVIDIA\penyisihan-datavidia-10\ISPU\data-indeks-standar-pencemar-udara-(ispu)-di-provinsi-dki-jakarta-komponen-data-2025.csv"
)

# Split "periode_data" into year (first four characters) and month
# (characters 5-6), and zero-pad "tanggal" so the three parts can be joined
# into a date string. Note that "tanggal" itself is overwritten with the
# padded string.
df_2025["tahun"] = df_2025["periode_data"].astype(str).str[:4]
df_2025["bulan"] = df_2025["periode_data"].astype(str).str[4:6]
df_2025["tanggal"] = df_2025["tanggal"].astype(str).str.zfill(2)

# Combinations that do not form a valid date become NaT.
df_2025["tanggal_full"] = pd.to_datetime(
    df_2025["tahun"] + "-" + df_2025["bulan"] + "-" + df_2025["tanggal"],
    errors="coerce"
)

# expand=False returns a Series (one code per row) instead of a one-column
# DataFrame; rows without a DKI<digits> match stay missing.
df_2025["kode_stasiun"] = df_2025["stasiun"].str.extract(r"(DKI\d+)", expand=False)

# Row id: "<YYYY-MM-DD>_<station code>"; missing if either part is missing.
df_2025["id"] = (
    df_2025["tanggal_full"].dt.strftime("%Y-%m-%d")
    + "_"
    + df_2025["kode_stasiun"]
)

# NOTE(review): the 2024/2025 loop cell earlier in this notebook writes the
# same OUTPUT_FOLDER / "2025_id.csv" with a different column layout (there
# "tanggal" holds the full date and no tahun/bulan/tanggal_full columns are
# added); whichever cell runs last wins.
df_2025.to_csv(OUTPUT_FOLDER / "2025_id.csv", index=False)

# Preview goes last: only the final expression of a cell is displayed.
df_2025[["tanggal", "stasiun", "id"]].head()


