In [None]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display

BASE_DIR = Path.cwd()
OUTPUT_DIR = BASE_DIR / "output"       

OUTPUT_DIR.mkdir(exist_ok=True)


print("BASE_DIR      :", BASE_DIR)
print("OUTPUT_DIR      :", OUTPUT_DIR)


# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found



# =========================
# CONFIG
# =========================
FILES = {
    "dki1_bundaranhi": "cuaca-harian-dki1-bundaranhi.csv",
    "dki2_kelapagading": "cuaca-harian-dki2-kelapagading.csv",
    "dki3_jagakarsa": "cuaca-harian-dki3-jagakarsa.csv",
    "dki4_lubangbuaya": "cuaca-harian-dki4-lubangbuaya.csv",
    "dki5_kebonjeruk": "cuaca-harian-dki5-kebonjeruk.csv",
}

NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]


# =========================
# CARI script_eda.py
# =========================
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,count_rows_per_dataset


BASE_DIR      : c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\external_data_processing\cuaca_harian
OUTPUT_DIR      : c:\Users\veiro\Documents\datavidia\AIR-POLLUTION-PREDICTION-PENYISIHAN-DATAVIDIA-10\external_data_processing\cuaca_harian\output


In [2]:
# =========================
# RESOLVE PATH
# =========================
resolved_files = find_files(FILES)
dfs = {}

for lokasi, path in resolved_files.items():
    df_temp = pd.read_csv(path, na_values=NA_VALUES)

    # simpan dataframe per lokasi
    dfs[lokasi] = df_temp

    print(f"[OK] Loaded {path.name} | rows: {len(df_temp)}")


# =========================
# CONCAT
# =========================
if dfs:
    df_cuaca = pd.concat(dfs, ignore_index=True)
else:
    df_cuaca = pd.DataFrame()

print("\nFinal cuaca dataframe shape:", df_cuaca.shape)
print("Columns:", df_cuaca.columns.tolist())


[OK] Loaded cuaca-harian-dki1-bundaranhi.csv | rows: 5722
[OK] Loaded cuaca-harian-dki2-kelapagading.csv | rows: 5722
[OK] Loaded cuaca-harian-dki3-jagakarsa.csv | rows: 5722
[OK] Loaded cuaca-harian-dki4-lubangbuaya.csv | rows: 5722
[OK] Loaded cuaca-harian-dki5-kebonjeruk.csv | rows: 5722

Final cuaca dataframe shape: (28610, 24)
Columns: ['time', 'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'precipitation_sum (mm)', 'precipitation_hours (h)', 'wind_speed_10m_max (km/h)', 'wind_direction_10m_dominant (°)', 'shortwave_radiation_sum (MJ/m²)', 'temperature_2m_mean (°C)', 'relative_humidity_2m_mean (%)', 'cloud_cover_mean (%)', 'surface_pressure_mean (hPa)', 'wind_gusts_10m_max (km/h)', 'winddirection_10m_dominant (°)', 'relative_humidity_2m_max (%)', 'relative_humidity_2m_min (%)', 'cloud_cover_max (%)', 'cloud_cover_min (%)', 'wind_gusts_10m_mean (km/h)', 'wind_speed_10m_mean (km/h)', 'wind_gusts_10m_min (km/h)', 'wind_speed_10m_min (km/h)', 'surface_pressure_max (hPa)', 'surf

In [3]:
# print("\n" + "="*50)
# print("REPORT DATA DKI1 – BUNDARAN HI")
# print("="*50)

# evaluate_dataset(
#      dfs["dki1_bundaranhi"],
#      name="Cuaca DKI1 Bundaran HI",
#      unique="time"
# )


In [4]:
# print("\n" + "="*50)
# print("REPORT DATA DKI2 – KELAPA GADING")
# print("="*50)

# evaluate_dataset(
#     dfs["dki2_kelapagading"],
#     name="Cuaca DKI2 Kelapa Gading",
#     unique="time"
# )


In [5]:
# print("\n" + "="*50)
# print("REPORT DATA DKI3 – JAGAKARSA")
# print("="*50)

# evaluate_dataset(
#     dfs["dki3_jagakarsa"],
#     name="Cuaca DKI3 Jagakarsa",
#     unique="time"
# )


In [6]:
# print("\n" + "="*50)
# print("REPORT DATA DKI4 – LUBANG BUAYA")
# print("="*50)

# evaluate_dataset(
#     dfs["dki4_lubangbuaya"],
#     name="Cuaca DKI4 Lubang Buaya",
#     unique="time"
# )


In [7]:
# print("\n" + "="*50)
# print("REPORT DATA DKI5 – KEBON JERUK")
# print("="*50)

# evaluate_dataset(
#     dfs["dki5_kebonjeruk"],
#     name="Cuaca DKI5 Kebon Jeruk",
#     unique="time"
# )


In [8]:
column_schema = extract_column_schema(dfs)
column_schema


{'dki1_bundaranhi': {'columns': ['time',
   'temperature_2m_max (°C)',
   'temperature_2m_min (°C)',
   'precipitation_sum (mm)',
   'precipitation_hours (h)',
   'wind_speed_10m_max (km/h)',
   'wind_direction_10m_dominant (°)',
   'shortwave_radiation_sum (MJ/m²)',
   'temperature_2m_mean (°C)',
   'relative_humidity_2m_mean (%)',
   'cloud_cover_mean (%)',
   'surface_pressure_mean (hPa)',
   'wind_gusts_10m_max (km/h)',
   'winddirection_10m_dominant (°)',
   'relative_humidity_2m_max (%)',
   'relative_humidity_2m_min (%)',
   'cloud_cover_max (%)',
   'cloud_cover_min (%)',
   'wind_gusts_10m_mean (km/h)',
   'wind_speed_10m_mean (km/h)',
   'wind_gusts_10m_min (km/h)',
   'wind_speed_10m_min (km/h)',
   'surface_pressure_max (hPa)',
   'surface_pressure_min (hPa)'],
  'n_columns': 24},
 'dki2_kelapagading': {'columns': ['time',
   'temperature_2m_max (°C)',
   'temperature_2m_min (°C)',
   'precipitation_sum (mm)',
   'precipitation_hours (h)',
   'wind_speed_10m_max (km/h)',
  

In [9]:
internal_dups = find_internal_duplicate_columns(column_schema)
internal_dups


{}

In [10]:
row_summary = count_rows_per_dataset(dfs)
row_summary


{'dki1_bundaranhi': 5722,
 'dki2_kelapagading': 5722,
 'dki3_jagakarsa': 5722,
 'dki4_lubangbuaya': 5722,
 'dki5_kebonjeruk': 5722,
 '__total__': 28610}