In [28]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display




NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column








In [29]:
path = find_file("merged_libur_cuaca_ispu_ndvi.csv")

if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path, na_values=NA_VALUES)

df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,is_libur
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,1
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,1
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,0


In [30]:
df["tanggal"] = pd.to_datetime(df["tanggal"])

kategori_list = df["kategori"].dropna().unique().tolist()
print(kategori_list)


['SEDANG', 'BAIK', 'TIDAK SEHAT', 'SANGAT TIDAK SEHAT', 'BERBAHAYA']


In [31]:
start_date = "2023-01-01"
end_date = "2025-08-31"

df_newest = df[
    (df["tanggal"] >= start_date) &
    (df["tanggal"] <= end_date)
]

kategori_counts = df_newest["kategori"].value_counts().sort_index()
print(kategori_counts)


kategori
BAIK                   617
SANGAT TIDAK SEHAT       4
SEDANG                3684
TIDAK SEHAT            526
Name: count, dtype: int64


In [32]:
# 1. Create a cross-tabulation of locations and categories
ct = pd.crosstab(df_newest['lokasi_clean'], df_newest['kategori'])

# 2. Normalize by row (index) to get decimals, then multiply by 100
# 'index' ensures the percentage is calculated per location
percentage_df = ct.div(ct.sum(axis=1), axis=0) * 100

# 3. Format to add the '%' sign and round (optional, but matches your output)
output = percentage_df.round(0).astype(int).astype(str) + '%'

print(output)

kategori     BAIK SANGAT TIDAK SEHAT SEDANG TIDAK SEHAT
lokasi_clean                                           
DKI1          15%                 0%    79%          6%
DKI2          10%                 0%    82%          8%
DKI3           5%                 0%    90%          5%
DKI4          11%                 0%    64%         25%
DKI5          23%                 0%    66%         11%


In [33]:
# 1. Buat cross-tabulation (ini secara default menghasilkan angka asli/count)
ct = pd.crosstab(df_newest['lokasi_clean'], df_newest['kategori'])

# (Opsional) Jika ingin mengurutkan kolom sesuai urutan kategori yang kamu sebutkan
kategori_order = ['BAIK', 'SEDANG', 'TIDAK SEHAT', 'SANGAT TIDAK SEHAT', 'BERBAHAYA']
ct = ct.reindex(columns=kategori_order)

print(ct)

kategori      BAIK  SEDANG  TIDAK SEHAT  SANGAT TIDAK SEHAT  BERBAHAYA
lokasi_clean                                                          
DKI1           149     764           56                   0        NaN
DKI2            99     792           78                   0        NaN
DKI3            48     875           44                   1        NaN
DKI4           102     616          239                   3        NaN
DKI5           219     637          109                   0        NaN
