In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



# Strings to treat as missing values; passed as `na_values` to the pd.read_csv calls in this notebook.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Return the first file called ``name`` found at or around ``start``.

    The search looks recursively under ``start``; if nothing matches, it moves
    to each parent directory in turn (up to the filesystem root) and searches
    recursively there.

    Args:
        name: File name or glob pattern handed to ``Path.rglob``.
        start: Directory to begin from (``Path`` or ``str``). Defaults to the
            current working directory at call time.

    Returns:
        ``Path`` of the first matching file, or ``None`` when no ancestor
        contains one.
    """
    # Resolve the default here rather than in the signature: a default of
    # Path.cwd() is evaluated once when the function is defined and goes
    # stale if the working directory changes afterwards.
    start = Path.cwd() if start is None else Path(start)
    for ancestor in [start, *start.parents]:
        # Consume rglob lazily so the walk stops at the first hit instead of
        # listing every match in the tree; skip directories sharing the name.
        match = next((p for p in ancestor.rglob(name) if p.is_file()), None)
        if match is not None:
            return match
    return None


def find_files(file_map):
    """Resolve every file name in ``file_map`` with ``find_file``.

    Args:
        file_map: Mapping of an arbitrary key to the file name to look for.

    Returns:
        Dict with the same keys mapped to the located paths. Keys whose file
        could not be located are left out and a warning line is printed.
    """
    located = {}
    for label, fname in file_map.items():
        hit = find_file(fname)
        if not hit:
            # Report the miss and move on; the caller decides what to do about it
            print(f"[WARNING] File not found: {fname}")
            continue
        located[label] = hit
    return located

# Locate the shared EDA helper module under the working directory or one of its ancestors
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Add the folder containing script_eda.py to sys.path so it can be imported.
# The membership check keeps sys.path from growing by one duplicate entry
# every time this cell is re-run.
eda_script_dir = str(eda_script_path.parent)
if eda_script_dir not in sys.path:
    sys.path.append(eda_script_dir)

# Import after the path tweak above so script_eda can be resolved
from script_eda import (
    evaluate_dataset,
    extract_column_schema,
    find_internal_duplicate_columns,
    extract_single_schema,
    cek_value_data_column,
)




In [2]:
# Load the holiday table. The error names the exact file so a missing input
# can be told apart from the main dataset below.
path_libur = find_file("libur_processed.csv")

if path_libur is None:
    raise FileNotFoundError("❌ File libur_processed.csv tidak ditemukan")

df_libur = pd.read_csv(path_libur, na_values=NA_VALUES)

# Load the main merged dataset
path_main_data = find_file("merged_cuaca_ndvi_ispu.csv")

if path_main_data is None:
    raise FileNotFoundError("❌ File merged_cuaca_ndvi_ispu.csv tidak ditemukan")

df_main_data = pd.read_csv(path_main_data, na_values=NA_VALUES)

# Preview the first rows as the cell output
df_main_data.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


In [3]:
# =========================
# 1. Ensure datetime types
# =========================
# Parse both key columns so the merge key has the same dtype on each side.
df_libur["tanggal"] = pd.to_datetime(df_libur["tanggal"])
df_main_data["tanggal"] = pd.to_datetime(df_main_data["tanggal"])

# =========================
# 2. Remove existing is_libur if any
# =========================
# Keeps the cell re-runnable: merging again while the column is still present
# would produce is_libur_x / is_libur_y suffix columns instead of is_libur.
df_main_data = df_main_data.drop(columns=["is_libur"], errors="ignore")

# =========================
# 3. Merge is_libur (ONLY ONE COLUMN)
# =========================
# Collapse the holiday table to one row per date first. A date listed more
# than once would otherwise duplicate every matching row of df_main_data in
# the left join. Assuming is_libur is a 0/1 flag, max() keeps it set when any
# entry for that date marks a holiday.
libur_per_tanggal = df_libur.groupby("tanggal", as_index=False)["is_libur"].max()

n_rows_before = len(df_main_data)
df_main_data = df_main_data.merge(
    libur_per_tanggal,
    on="tanggal",
    how="left",
    validate="many_to_one",  # raise instead of silently multiplying rows
)
assert len(df_main_data) == n_rows_before, "merge changed the number of rows"

# =========================
# 4. Fill missing values
# =========================
# Dates with no entry in the holiday table come back as NaN from the left
# join; they are stored as 0.
df_main_data["is_libur"] = df_main_data["is_libur"].fillna(0).astype(int)

# =========================
# 5. Sanity check
# =========================

df_main_data[["tanggal", "is_libur"]].head()


Unnamed: 0,tanggal,is_libur
0,2010-01-01,1
1,2010-01-02,1
2,2010-01-03,1
3,2010-01-04,0
4,2010-01-05,0


In [4]:
from pathlib import Path

# Output folder: "datasets" inside the current working directory (created if absent)
OUTPUT_DIR = Path.cwd().joinpath("datasets")
OUTPUT_DIR.mkdir(exist_ok=True)

# Write the merged frame to CSV without the row index
output_path = OUTPUT_DIR.joinpath("merged_libur_cuaca_ispu_ndvi.csv")
df_main_data.to_csv(output_path, index=False)

# Report where the file was written
print(f"✅ File berhasil diexport ke:\n{output_path}")


✅ File berhasil diexport ke:
c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\datasets\merged_libur_cuaca_ispu_ndvi.csv
