In [1]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display




# Cell text that should be read as missing data; this list is handed to
# `pd.read_csv(..., na_values=NA_VALUES)` when the weather CSV is loaded.
NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Return the first path matching ``name`` at or above ``start``.

    The search begins with a recursive glob of ``start`` itself and then
    repeats the recursive glob in each parent directory, nearest first,
    stopping at the first directory whose subtree contains a match.

    Parameters
    ----------
    name : str
        File name (or glob pattern) passed to ``Path.rglob``.
    start : path-like, optional
        Directory where the search begins. When omitted, the current working
        directory *at call time* is used.

    Returns
    -------
    pathlib.Path or None
        The first match produced by ``rglob``, or ``None`` when no ancestor
        subtree contains ``name``.

    Notes
    -----
    When nothing matches, the walk reaches the filesystem root and globs it
    recursively, which can be slow on large disks.
    """
    # A `Path.cwd()` default in the signature is evaluated only once, when the
    # function is defined, so it would ignore any later change of directory.
    origin = Path.cwd() if start is None else Path(start)

    for ancestor in (origin, *origin.parents):
        # Only the first hit is needed, so take it lazily instead of
        # materialising every match in the subtree.
        first_match = next(ancestor.rglob(name), None)
        if first_match is not None:
            return first_match
    return None


def find_files(file_map):
    """Resolve several file names at once via ``find_file``.

    Parameters
    ----------
    file_map : dict
        Mapping of an arbitrary key to the file name to look up.

    Returns
    -------
    dict
        Mapping of key to the located path, containing only the entries that
        were found. A warning line is printed for every name that could not
        be located.
    """
    located = {}
    for label, target_name in file_map.items():
        hit = find_file(target_name)
        if not hit:
            print(f"[WARNING] File not found: {target_name}")
            continue
        located[label] = hit
    return located

# Locate the project helper module; stop early with a clear error if it is missing.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Make the directory that holds script_eda.py importable. The membership check
# keeps the cell idempotent: re-running it no longer stacks duplicate entries
# onto sys.path.
eda_script_dir = str(eda_script_path.parent)
if eda_script_dir not in sys.path:
    sys.path.append(eda_script_dir)

# The helpers can be imported now that their directory is on sys.path.
from script_eda import (
    evaluate_dataset,
    extract_column_schema,
    find_internal_duplicate_columns,
    extract_single_schema,
    cek_value_data_column,
)

# Locate the processed daily-weather CSV that the next cell loads.
path_merged_data = find_file("cuaca_harian_dki_processed.csv")

if path_merged_data is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")






In [2]:
df_cuaca = pd.read_csv(path_merged_data, na_values=NA_VALUES)

# Build both helper columns in one pass on a new frame, leaving `df_cuaca`
# exactly as it was loaded.
#   lokasi_clean : `lokasi` as text, upper-cased first, then cut to 4 characters
#   id           : the full `time` text + "_" + lokasi_clean
df_copy = df_cuaca.assign(
    lokasi_clean=lambda frame: frame["lokasi"].astype(str).str.upper().str[:4],
    id=lambda frame: frame["time"].astype(str) + "_" + frame["lokasi_clean"],
)

df_copy.head()


Unnamed: 0,time,temperature_2m_max (°C),temperature_2m_min (°C),precipitation_sum (mm),precipitation_hours (h),wind_speed_10m_max (km/h),wind_direction_10m_dominant (°),shortwave_radiation_sum (MJ/m²),temperature_2m_mean (°C),relative_humidity_2m_mean (%),...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,id
0,2010-01-01,29.4,24.4,4.0,14.0,16.0,246,16.24,26.6,81,...,99,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,2010-01-01_DKI1
1,2010-01-02,30.8,24.0,6.5,4.0,14.7,238,19.8,26.9,82,...,91,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,2010-01-02_DKI1
2,2010-01-03,30.4,24.3,7.6,11.0,12.6,244,17.32,26.7,83,...,81,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,2010-01-03_DKI1
3,2010-01-04,30.3,25.4,0.9,5.0,19.3,239,20.43,27.4,81,...,17,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,2010-01-04_DKI1
4,2010-01-05,29.9,24.8,14.3,7.0,15.9,247,15.86,26.4,83,...,99,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,2010-01-05_DKI1


In [3]:
# Sanity check: number of rows whose `id` repeats an earlier row's `id`.
print("Duplicate ID:", df_copy.duplicated(subset="id").sum())

# Show each distinct (lokasi, lokasi_clean) pair to verify the shortened codes.
print(df_copy.loc[:, ["lokasi", "lokasi_clean"]].drop_duplicates())


Duplicate ID: 0
                  lokasi lokasi_clean
0        dki1_bundaranhi         DKI1
5722   dki2_kelapagading         DKI2
11444     dki3_jagakarsa         DKI3
17166   dki4_lubangbuaya         DKI4
22888    dki5_kebonjeruk         DKI5


In [None]:
from pathlib import Path

# -------------------------------------------------------------------
# Export: write the cleaned frame to <current working dir>/output/
# -------------------------------------------------------------------
OUTPUT_DIR = Path.cwd().joinpath("output")
OUTPUT_DIR.mkdir(exist_ok=True)  # no error when the folder already exists

output_path = OUTPUT_DIR.joinpath("cuaca_harian_dki_clean.csv")

# index=False: the row index is not written as a column.
df_copy.to_csv(output_path, index=False)

print(f"✅ File berhasil diexport ke:\n{output_path}")
