In [1]:
from pathlib import Path
import pandas as pd

# =========================
# CONFIG
# =========================
# Everything is resolved relative to the directory the notebook runs from.
BASE_DIR = Path.cwd()

# Processed files are written to an "output" folder next to the notebook;
# it is created on first run and reused afterwards.
OUTPUT_DIR = BASE_DIR.joinpath("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Extra cell values that pandas should read as missing data.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]

# Echo the resolved locations so a wrong working directory is obvious.
print("BASE_DIR  :", BASE_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)

# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Return the first path matching ``name`` found at or around ``start``.

    The search runs a recursive glob below ``start``; if nothing matches it
    repeats the recursive glob below each parent directory in turn, nearest
    parent first, and stops at the first hit.

    Args:
        name: File name or glob pattern passed to ``Path.rglob``.
        start: Directory to begin from (``str`` or ``Path``). Defaults to
            the current working directory *at call time*.

    Returns:
        The first matching ``Path``, or ``None`` when no directory on the
        way up to the filesystem root contains a match.

    Note:
        Each step up re-scans a larger tree, so a name that does not exist
        anywhere can end up scanning everything below the filesystem root.
    """
    # Resolve the default here rather than in the signature: a default of
    # ``Path.cwd()`` is evaluated once at definition time and goes stale if
    # the working directory changes afterwards.
    start = Path.cwd() if start is None else Path(start)

    for ancestor in (start, *start.parents):
        # Only the first match is needed, so stop the walk as soon as one
        # is produced instead of materialising every match in the subtree.
        match = next(iter(ancestor.rglob(name)), None)
        if match is not None:
            return match
    return None


# =========================
# LOAD DATA (USING find_file)
# =========================
# Station label (written to the "lokasi" column) -> CSV file name to locate.
# Insertion order defines the row order of the merged frame.
STATION_FILES = {
    "dki1_bundaranhi": "cuaca-harian-dki1-bundaranhi.csv",
    "dki2_kelapagading": "cuaca-harian-dki2-kelapagading.csv",
    "dki3_jagakarsa": "cuaca-harian-dki3-jagakarsa.csv",
    "dki4_lubangbuaya": "cuaca-harian-dki4-lubangbuaya.csv",
    "dki5_kebonjeruk": "cuaca-harian-dki5-kebonjeruk.csv",
}


def _load_station(csv_path, lokasi):
    """Read one station CSV and tag every row with its station label."""
    frame = pd.read_csv(csv_path, na_values=NA_VALUES)
    frame["lokasi"] = lokasi
    return frame


station_paths = {
    lokasi: find_file(file_name) for lokasi, file_name in STATION_FILES.items()
}

# Fail before reading anything, and say exactly which files are missing so
# the user does not have to check all five by hand.
missing_files = [
    STATION_FILES[lokasi]
    for lokasi, csv_path in station_paths.items()
    if csv_path is None
]
if missing_files:
    raise FileNotFoundError(
        "❌ Salah satu file CSV tidak ditemukan: " + ", ".join(missing_files)
    )

station_frames = [
    _load_station(csv_path, lokasi) for lokasi, csv_path in station_paths.items()
]

# Keep the per-station names available in case later cells refer to them.
path_dki1, path_dki2, path_dki3, path_dki4, path_dki5 = station_paths.values()
df_dki1, df_dki2, df_dki3, df_dki4, df_dki5 = station_frames

print("[OK] Semua file berhasil diload via find_file()")

# =========================
# MERGE
# =========================
# Stack all stations row-wise; columns are aligned by name, so a column that
# exists in only some files is filled with NaN for the others.
# NOTE(review): the recorded output of this cell lists both
# 'wind_direction_10m_dominant (°)' and 'winddirection_10m_dominant (°)',
# which suggests the station files do not share one header spelling.
# Confirm whether these two columns should be coalesced before modelling.
df_cuaca = pd.concat(station_frames, ignore_index=True)

print("\nMerged dataframe shape:", df_cuaca.shape)
print("Columns:", df_cuaca.columns.tolist())

# =========================
# BASIC VALIDATION
# =========================
# Row count per station: a quick check that every file contributed rows.
print("\nDistribusi data per lokasi:")
print(df_cuaca["lokasi"].value_counts())

# =========================
# SAVE OUTPUT
# =========================
output_path = OUTPUT_DIR / "cuaca_harian_dki_processed.csv"
df_cuaca.to_csv(output_path, index=False)

print(f"\n[OK] File merged disimpan di:\n{output_path}")


BASE_DIR  : c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\external_data_processing\cuaca_harian
OUTPUT_DIR: c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\external_data_processing\cuaca_harian\output
[OK] Semua file berhasil diload via find_file()

Merged dataframe shape: (28610, 25)
Columns: ['time', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'precipitation_sum (mm)', 'precipitation_hours (h)', 'wind_speed_10m_max (km/h)', 'wind_direction_10m_dominant (°)', 'shortwave_radiation_sum (MJ/m²)', 'temperature_2m_mean (°C)', 'relative_humidity_2m_mean (%)', 'cloud_cover_mean (%)', 'surface_pressure_mean (hPa)', 'wind_gusts_10m_max (km/h)', 'winddirection_10m_dominant (°)', 'relative_humidity_2m_max (%)', 'relative_humidity_2m_min (%)', 'cloud_cover_max (%)', 'cloud_cover_min (%)', 'wind_gusts_10m_mean (km/h)', 'wind_speed_10m_mean (km/h)', 'wind_gusts_10m_min (km/h)', 'wind_speed_10m_min (km/h)', 'surface_