In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



# Strings treated as missing values; passed as `na_values` to every pd.read_csv call in this notebook.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Return the first file matching ``name`` at or above ``start``.

    ``start`` is searched recursively first. If nothing matches, every parent
    directory is searched recursively in turn, nearest parent first.

    Args:
        name: File name (or glob pattern) handed to ``Path.rglob``.
        start: Directory to begin from, as a ``Path`` or ``str``. Defaults to
            the working directory at the time of the call.

    Returns:
        The first matching ``Path``, or ``None`` if no searched directory
        contains a match.

    Note:
        The walk continues up to the filesystem root, so looking for a file
        that does not exist can be slow.
    """
    # Resolve the default here: a `Path.cwd()` default in the signature is
    # evaluated once, when the function is defined, and would ignore any later
    # change of working directory.
    start = Path.cwd() if start is None else Path(start)

    for ancestor in (start, *start.parents):
        # Only the first hit is needed, so stop the traversal as soon as one
        # is produced instead of collecting every match.
        match = next(ancestor.rglob(name), None)
        if match is not None:
            return match
    return None


def find_files(file_map):
    """Resolve each filename in ``file_map`` to a path using ``find_file``.

    Args:
        file_map: Mapping of an arbitrary key to the filename to look up.

    Returns:
        A dict with the same keys mapped to the located paths. Entries whose
        file could not be located are left out, and a warning is printed for
        each of them.
    """
    resolved = {}
    for key, filename in file_map.items():
        location = find_file(filename)
        if not location:
            # Report the miss and move on; the key is simply absent from the result.
            print(f"[WARNING] File not found: {filename}")
            continue
        resolved[key] = location
    return resolved

# Locate script_eda.py by searching the working directory and then its parents.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Make the directory that holds script_eda.py importable. The membership check
# keeps re-runs of this cell from stacking duplicate entries onto sys.path.
eda_script_dir = str(eda_script_path.parent)
if eda_script_dir not in sys.path:
    sys.path.append(eda_script_dir)

# With its directory on sys.path, script_eda can now be imported.
from script_eda import (
    evaluate_dataset,
    extract_column_schema,
    find_internal_duplicate_columns,
    extract_single_schema,
    cek_value_data_column,
)




In [2]:
# Locate the national-holiday / weekend calendar CSV.
path_libur = find_file("dataset-libur-nasional-dan-weekend.csv")

if path_libur is None:
    # Name the file that is actually missing (the previous message referred to
    # a "merged" file that this cell never looks for).
    raise FileNotFoundError("❌ File dataset-libur-nasional-dan-weekend.csv tidak ditemukan")

# Load the calendar, treating the placeholder strings in NA_VALUES as missing.
df_libur = pd.read_csv(path_libur, na_values=NA_VALUES)

df_libur.head()

Unnamed: 0,tanggal,is_holiday_nasional,nama_libur,is_weekend,day_name
0,2010-01-01,1,New Year's Day,0,Friday
1,2010-01-02,0,,1,Saturday
2,2010-01-03,0,,1,Sunday
3,2010-01-04,0,,0,Monday
4,2010-01-05,0,,0,Tuesday


In [3]:
# Step 1: coerce both flag columns to integers, counting missing entries as 0.
flag_columns = ["is_holiday_nasional", "is_weekend"]
for flag in flag_columns:
    df_libur[flag] = df_libur[flag].fillna(0).astype(int)

# Step 2: a day is a day off when at least one of the two flags equals 1.
df_libur["is_libur"] = df_libur[flag_columns].eq(1).any(axis=1).astype(int)

# Preview the inputs next to the derived flag.
df_libur[["tanggal", *flag_columns, "is_libur"]].head()


Unnamed: 0,tanggal,is_holiday_nasional,is_weekend,is_libur
0,2010-01-01,1,0,1
1,2010-01-02,0,1,1
2,2010-01-03,0,1,1
3,2010-01-04,0,0,0
4,2010-01-05,0,0,0


In [4]:
# Reuse the location of an existing libur_processed.csv when there is one, so
# the file is overwritten in place.
existing_output = find_file("libur_processed.csv")

if existing_output is not None:
    output_path = existing_output
else:
    # First run: there is no previous export to overwrite. Passing None to
    # DataFrame.to_csv would return the CSV text without writing any file, so
    # fall back to a concrete path and create its directory.
    output_path = Path.cwd() / "datasets" / "libur_processed.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)

# save
df_libur.to_csv(output_path, index=False)

print(f"✅ Saved to: {output_path}")


✅ Saved to: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\external_data_processing\libur_nasional\output\libur_processed.csv


In [5]:
# Locate the submission template and the processed holiday calendar.
path_sample = find_file("sample_submission.csv")
path_processed_libur = find_file("libur_processed.csv")

# Each message names the file that is actually missing (the first one used to
# report a "merged" file instead of sample_submission.csv).
if path_sample is None:
    raise FileNotFoundError("❌ File sample_submission.csv tidak ditemukan")

if path_processed_libur is None:
    raise FileNotFoundError("❌ File libur_processed.csv tidak ditemukan")

# Load both tables, treating the placeholder strings in NA_VALUES as missing.
df_sample = pd.read_csv(path_sample, na_values=NA_VALUES)
df_libur_processed = pd.read_csv(path_processed_libur, na_values=NA_VALUES)

df_sample.head()

Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [6]:
# Preview the first rows of the processed holiday calendar that was just loaded.
df_libur_processed.head()

Unnamed: 0,tanggal,is_holiday_nasional,nama_libur,is_weekend,day_name,is_libur
0,2010-01-01,1,New Year's Day,0,Friday,1
1,2010-01-02,0,,1,Saturday,1
2,2010-01-03,0,,1,Sunday,1
3,2010-01-04,0,,0,Monday,0
4,2010-01-05,0,,0,Tuesday,0


In [7]:
import pandas as pd

# =========================
# 1. Ensure datetime types
# =========================
# Parse the calendar dates so they have the same dtype as the dates derived
# from the submission ids below.
df_libur_processed["tanggal"] = pd.to_datetime(df_libur_processed["tanggal"])

# =========================
# 2. Create tanggal in submission FIRST
# =========================
# The first 10 characters of `id` are parsed as the date (YYYY-MM-DD).
df_sample["tanggal"] = pd.to_datetime(
    df_sample["id"].str[:10],
    format="%Y-%m-%d"
)

# =========================
# 3. Remove old is_libur if exists
# =========================
# Keeps the cell re-runnable: a previous run's column is discarded before merging.
df_sample = df_sample.drop(columns=["is_libur"], errors="ignore")

# =========================
# 4. Merge is_libur safely
# =========================
# Collapse the calendar to one row per date before joining. A date listed more
# than once would otherwise duplicate submission rows in the left join. A date
# counts as a day off when any of its rows is flagged.
libur_per_tanggal = (
    df_libur_processed
    .groupby("tanggal", as_index=False)["is_libur"]
    .max()
)

n_rows_before_merge = len(df_sample)

df_sample = df_sample.merge(
    libur_per_tanggal,
    on="tanggal",
    how="left",
    validate="many_to_one",  # fail loudly if the calendar side is not unique per date
)

assert len(df_sample) == n_rows_before_merge, "merge changed the number of submission rows"

# =========================
# 5. Fill missing values
# =========================
# Dates absent from the calendar are treated as regular days (0).
df_sample["is_libur"] = df_sample["is_libur"].fillna(0).astype(int)

# =========================
# 6. Drop helper column
# =========================
df_sample = df_sample.drop(columns=["tanggal"])

# =========================
# 7. Final sanity check
# =========================

df_sample


Unnamed: 0,id,category,is_libur
0,2025-09-01_DKI1,,0
1,2025-09-01_DKI2,,0
2,2025-09-01_DKI3,,0
3,2025-09-01_DKI4,,0
4,2025-09-01_DKI5,,0
...,...,...,...
450,2025-11-30_DKI1,,1
451,2025-11-30_DKI2,,1
452,2025-11-30_DKI3,,1
453,2025-11-30_DKI4,,1


In [8]:
from pathlib import Path

# -------------------------
# Export location: a "datasets" folder inside the current working directory,
# created on first use.
# -------------------------
OUTPUT_DIR = Path.cwd().joinpath("datasets")
OUTPUT_DIR.mkdir(exist_ok=True)

# -------------------------
# Write the submission frame (with its is_libur column) to CSV, without the index.
# -------------------------
output_path = OUTPUT_DIR.joinpath("merged_sample_libur.csv")
df_sample.to_csv(output_path, index=False)

print(f"✅ File berhasil diexport ke:\n{output_path}")


✅ File berhasil diexport ke:
c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\models\datasets\merged_sample_libur.csv
