In [131]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display



# Cell contents that pd.read_csv should treat as missing (passed as `na_values`).
NA_VALUES = [
    "---", "--",   # dash placeholders
    "", " ",       # empty / single-space cells
    "NA", "N/A",   # textual "not available" markers
]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=None):
    """Find the first file matching ``name`` at or above a starting directory.

    The search recursively globs ``start`` itself, then each of its parent
    directories in turn (nearest first), and stops at the first match.

    Parameters
    ----------
    name : str
        File name or glob pattern handed to ``Path.rglob``.
    start : str or Path, optional
        Directory to begin from. Defaults to the current working directory,
        resolved at call time so a later ``os.chdir`` is honoured.

    Returns
    -------
    Path or None
        The first match yielded by ``rglob``, or ``None`` when nothing matches
        anywhere up to the filesystem root.

    Notes
    -----
    When the file does not exist the search ends up globbing the filesystem
    root recursively, which can be very slow.
    """
    origin = Path.cwd() if start is None else Path(start)
    for ancestor in (origin, *origin.parents):
        # rglob is lazy: take the first hit instead of listing every match.
        hit = next(ancestor.rglob(name), None)
        if hit is not None:
            return hit
    return None


def find_files(file_map):
    """Resolve every filename in ``file_map`` through ``find_file``.

    Parameters
    ----------
    file_map : dict
        Mapping of an arbitrary key to the filename to look for.

    Returns
    -------
    dict
        ``key -> path`` for each filename that was located. Filenames that
        could not be found are left out and reported with a printed warning.
    """
    resolved = {}
    for label, fname in file_map.items():
        location = find_file(fname)
        if not location:
            print(f"[WARNING] File not found: {fname}")
            continue
        resolved[label] = location
    return resolved

# Locate the project's EDA helper module; fail early if it cannot be found.
eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# Make the folder that holds script_eda.py importable. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
eda_module_dir = str(eda_script_path.parent)
if eda_module_dir not in sys.path:
    sys.path.append(eda_module_dir)

# This import has to come after the sys.path update above.
from script_eda import (
    evaluate_dataset,
    extract_column_schema,
    find_internal_duplicate_columns,
    extract_single_schema,
    cek_value_data_column,
)








In [132]:
# Locate and load the merged dataset; stop immediately if the file is missing.
path_main_data = find_file("merged_libur_cuaca_ispu_ndvi.csv")

if path_main_data is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

# Placeholder strings listed in NA_VALUES are read as NaN.
df = pd.read_csv(path_main_data, na_values=NA_VALUES)

# Preview. The earlier mid-cell `df.head()` was never displayed (only a cell's
# last expression is rendered) and `df = df.copy()` on a freshly read frame was
# a no-op, so both were dropped.
df.head()

Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi,is_libur
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,...,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023,1
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,...,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023,1
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,...,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023,1
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,...,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023,0
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,...,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023,0


In [133]:
# Parse the date column and coerce PM10 to numeric; values that cannot be
# parsed become NaT / NaN instead of raising.
df["tanggal"] = df["tanggal"].pipe(pd.to_datetime, errors="coerce")
df["pm_sepuluh"] = df["pm_sepuluh"].pipe(pd.to_numeric, errors="coerce")

df.loc[:, ["tanggal", "pm_sepuluh"]].describe()

Unnamed: 0,tanggal,pm_sepuluh
count,15257,15257.0
mean,2019-04-17 16:07:53.802189312,53.990387
min,2010-01-01 00:00:00,2.0
25%,2016-01-08 00:00:00,42.0
50%,2019-03-17 00:00:00,55.0
75%,2023-07-25 00:00:00,65.0
max,2025-08-31 00:00:00,187.0
std,,18.910874


In [134]:
# PM10 ISPU breakpoints (Indonesia).
# One row per band: (C_lo, C_hi, I_lo, I_hi) = concentration range -> index range.
# NOTE(review): the last band is open-ended (C_hi = inf), so pm10_to_ispu can
# never interpolate towards its I_hi of 500 - confirm whether the regulation
# defines a finite upper concentration for this band.
PM10_BREAKPOINTS = [
    (0, 50, 0, 50),
    (50, 150, 51, 100),
    (150, 350, 101, 200),
    (350, 420, 201, 300),
    (420, np.inf, 301, 500),
]


In [135]:
# PM10 reading -> ISPU sub-index

def pm10_to_ispu(c, breakpoints=None):
    """Convert one PM10 reading to its ISPU sub-index by linear interpolation.

    Parameters
    ----------
    c : float
        Reading to convert. Missing values (``pd.isna``) give NaN.
    breakpoints : iterable of (C_lo, C_hi, I_lo, I_hi), optional
        Band table to interpolate in. Defaults to the notebook-level
        ``PM10_BREAKPOINTS``.

    Returns
    -------
    float
        ``I_lo + (I_hi - I_lo) / (C_hi - C_lo) * (c - C_lo)`` for the first
        band whose closed interval ``[C_lo, C_hi]`` contains ``c``; ``I_lo``
        when that band is open-ended; NaN when no band matches (for example a
        negative reading).
    """
    if pd.isna(c):
        return np.nan

    table = PM10_BREAKPOINTS if breakpoints is None else breakpoints
    for C_lo, C_hi, I_lo, I_hi in table:
        if not (C_lo <= c <= C_hi):
            continue
        if C_hi == np.inf:
            # Open-ended band: there is no upper concentration to interpolate
            # towards, so the index stays at the band floor. This was already
            # the result for finite readings (slope = x / inf = 0); making it
            # explicit also stops an infinite reading from producing NaN.
            return float(I_lo)
        return ((I_hi - I_lo) / (C_hi - C_lo)) * (c - C_lo) + I_lo

    return np.nan


In [136]:
# Derive the PM10 sub-index element-wise from the numeric PM10 column.
df["pm10_ispu"] = df["pm_sepuluh"].map(pm10_to_ispu)

# Compare raw value and sub-index for the first ten rows.
df.loc[:, ["pm_sepuluh", "pm10_ispu"]].head(10)


Unnamed: 0,pm_sepuluh,pm10_ispu
0,60.0,55.9
1,32.0,32.0
2,27.0,27.0
3,22.0,22.0
4,25.0,25.0
5,30.0,30.0
6,41.0,41.0
7,64.0,57.86
8,55.0,53.45
9,34.0,34.0


In [137]:
# PM10 sub-index: summary statistics as a quick distribution check.
df.loc[:, "pm10_ispu"].describe()


count    15257.000000
mean        49.778436
std         14.030837
min          2.000000
25%         42.000000
50%         53.450000
75%         58.350000
max        119.315000
Name: pm10_ispu, dtype: float64

In [138]:
# PM2.5: coerce the raw column to numeric (unparseable entries become NaN),
# then summarise it next to the date range.
df["pm_duakomalima"] = df["pm_duakomalima"].pipe(pd.to_numeric, errors="coerce")

df.loc[:, ["tanggal", "pm_duakomalima"]].describe()


Unnamed: 0,tanggal,pm_duakomalima
count,15257,7000.0
mean,2019-04-17 16:07:53.802189312,75.620714
min,2010-01-01 00:00:00,10.0
25%,2016-01-08 00:00:00,60.0
50%,2019-03-17 00:00:00,76.0
75%,2023-07-25 00:00:00,91.0
max,2025-08-31 00:00:00,287.0
std,,24.085817


In [139]:
# PM2.5 summary (this cell repeats the preceding one).
# The numeric coercion of `pm_duakomalima` is already done by the preceding
# cell, so repeating it was redundant; only the summary display is kept.
df[["tanggal", "pm_duakomalima"]].describe()


Unnamed: 0,tanggal,pm_duakomalima
count,15257,7000.0
mean,2019-04-17 16:07:53.802189312,75.620714
min,2010-01-01 00:00:00,10.0
25%,2016-01-08 00:00:00,60.0
50%,2019-03-17 00:00:00,76.0
75%,2023-07-25 00:00:00,91.0
max,2025-08-31 00:00:00,287.0
std,,24.085817


In [140]:
# PM2.5 ISPU breakpoints (Indonesia).
# One row per band: (C_lo, C_hi, I_lo, I_hi) = concentration range -> index range.
# NOTE(review): the last band is open-ended (C_hi = inf), so pm25_to_ispu can
# never interpolate towards its I_hi of 500 - confirm whether the regulation
# defines a finite upper concentration for this band.
PM25_BREAKPOINTS = [
    (0.0, 15.5, 0, 50),
    (15.5, 55.4, 51, 100),
    (55.4, 150.4, 101, 200),
    (150.4, 250.4, 201, 300),
    (250.4, np.inf, 301, 500),
]


In [141]:
# PM2.5 reading -> ISPU sub-index

def pm25_to_ispu(c, breakpoints=None):
    """Convert one PM2.5 reading to its ISPU sub-index by linear interpolation.

    Parameters
    ----------
    c : float
        Reading to convert. Missing values (``pd.isna``) give NaN.
    breakpoints : iterable of (C_lo, C_hi, I_lo, I_hi), optional
        Band table to interpolate in. Defaults to the notebook-level
        ``PM25_BREAKPOINTS``.

    Returns
    -------
    float
        ``I_lo + (I_hi - I_lo) / (C_hi - C_lo) * (c - C_lo)`` for the first
        band whose closed interval ``[C_lo, C_hi]`` contains ``c``; ``I_lo``
        when that band is open-ended; NaN when no band matches (for example a
        negative reading).
    """
    if pd.isna(c):
        return np.nan

    table = PM25_BREAKPOINTS if breakpoints is None else breakpoints
    for C_lo, C_hi, I_lo, I_hi in table:
        if not (C_lo <= c <= C_hi):
            continue
        if C_hi == np.inf:
            # Open-ended band: there is no upper concentration to interpolate
            # towards, so the index stays at the band floor. This was already
            # the result for finite readings (slope = x / inf = 0); making it
            # explicit also stops an infinite reading from producing NaN.
            return float(I_lo)
        return ((I_hi - I_lo) / (C_hi - C_lo)) * (c - C_lo) + I_lo

    return np.nan


In [142]:
# Derive the PM2.5 sub-index element-wise; rows without a PM2.5 reading stay NaN.
df["pm25_ispu"] = df["pm_duakomalima"].map(pm25_to_ispu)

# Show the raw value beside its sub-index.
df.loc[:, ["pm_duakomalima", "pm25_ispu"]]


Unnamed: 0,pm_duakomalima,pm25_ispu
0,,
1,,
2,,
3,,
4,,
...,...,...
15252,72.0,118.298947
15253,60.0,105.793684
15254,70.0,116.214737
15255,59.0,104.751579


In [143]:
# PM2.5 sub-index: summary statistics.
df.loc[:, "pm25_ispu"].describe()


count    7000.000000
mean      121.341467
std        26.336524
min        32.258065
25%       105.793684
50%       122.467368
75%       138.098947
max       301.000000
Name: pm25_ispu, dtype: float64

In [144]:
# SO2: coerce the raw column to numeric (unparseable entries become NaN),
# then summarise it next to the date range.
df["sulfur_dioksida"] = df["sulfur_dioksida"].pipe(pd.to_numeric, errors="coerce")

df.loc[:, ["tanggal", "sulfur_dioksida"]].describe()


Unnamed: 0,tanggal,sulfur_dioksida
count,15257,15257.0
mean,2019-04-17 16:07:53.802189312,27.984564
min,2010-01-01 00:00:00,0.0
25%,2016-01-08 00:00:00,15.0
50%,2019-03-17 00:00:00,26.0
75%,2023-07-25 00:00:00,39.0
max,2025-08-31 00:00:00,112.0
std,,15.655351


In [145]:
# SO2 ISPU breakpoints (Indonesia).
# One row per band: (C_lo, C_hi, I_lo, I_hi) = concentration range -> index range.
# NOTE(review): the last band is open-ended (C_hi = inf), so so2_to_ispu can
# never interpolate towards its I_hi of 500 - confirm whether the regulation
# defines a finite upper concentration for this band.
SO2_BREAKPOINTS = [
    (0, 52, 0, 50),
    (52, 180, 51, 100),
    (180, 400, 101, 200),
    (400, 800, 201, 300),
    (800, np.inf, 301, 500),
]


In [146]:
# SO2 reading -> ISPU sub-index

def so2_to_ispu(c, breakpoints=None):
    """Convert one SO2 reading to its ISPU sub-index by linear interpolation.

    Parameters
    ----------
    c : float
        Reading to convert. Missing values (``pd.isna``) give NaN.
    breakpoints : iterable of (C_lo, C_hi, I_lo, I_hi), optional
        Band table to interpolate in. Defaults to the notebook-level
        ``SO2_BREAKPOINTS``.

    Returns
    -------
    float
        ``I_lo + (I_hi - I_lo) / (C_hi - C_lo) * (c - C_lo)`` for the first
        band whose closed interval ``[C_lo, C_hi]`` contains ``c``; ``I_lo``
        when that band is open-ended; NaN when no band matches (for example a
        negative reading).
    """
    if pd.isna(c):
        return np.nan

    table = SO2_BREAKPOINTS if breakpoints is None else breakpoints
    for C_lo, C_hi, I_lo, I_hi in table:
        if not (C_lo <= c <= C_hi):
            continue
        if C_hi == np.inf:
            # Open-ended band: there is no upper concentration to interpolate
            # towards, so the index stays at the band floor. This was already
            # the result for finite readings (slope = x / inf = 0); making it
            # explicit also stops an infinite reading from producing NaN.
            return float(I_lo)
        return ((I_hi - I_lo) / (C_hi - C_lo)) * (c - C_lo) + I_lo

    return np.nan


In [147]:
# Derive the SO2 sub-index element-wise from the numeric SO2 column.
df["so2_ispu"] = df["sulfur_dioksida"].map(so2_to_ispu)

# Show the raw value beside its sub-index.
df.loc[:, ["sulfur_dioksida", "so2_ispu"]]

Unnamed: 0,sulfur_dioksida,so2_ispu
0,4.0,3.846154
1,2.0,1.923077
2,2.0,1.923077
3,2.0,1.923077
4,2.0,1.923077
...,...,...
15252,45.0,43.269231
15253,53.0,51.382812
15254,29.0,27.884615
15255,27.0,25.961538


In [148]:
# SO2 sub-index: summary statistics.
df.loc[:, "so2_ispu"].describe()


count    15257.000000
mean        26.734937
std         14.687578
min          0.000000
25%         14.423077
50%         25.000000
75%         37.500000
max         73.968750
Name: so2_ispu, dtype: float64

In [149]:
# CO (carbon monoxide): coerce the raw column to numeric (unparseable entries
# become NaN), then summarise it next to the date range.
df["karbon_monoksida"] = df["karbon_monoksida"].pipe(pd.to_numeric, errors="coerce")

df.loc[:, ["tanggal", "karbon_monoksida"]].describe()


Unnamed: 0,tanggal,karbon_monoksida
count,15257,15257.0
mean,2019-04-17 16:07:53.802189312,20.248061
min,2010-01-01 00:00:00,0.0
25%,2016-01-08 00:00:00,11.0
50%,2019-03-17 00:00:00,18.0
75%,2023-07-25 00:00:00,26.0
max,2025-08-31 00:00:00,134.0
std,,12.157342


In [150]:
# CO ISPU breakpoints (Indonesia, µg/m³).
# One row per band: (C_lo, C_hi, I_lo, I_hi) = concentration range -> index range.
# NOTE(review): the last band is open-ended (C_hi = inf), so co_to_ispu can
# never interpolate towards its I_hi of 500 - confirm whether the regulation
# defines a finite upper concentration for this band.
CO_BREAKPOINTS = [
    (0, 4000, 0, 50),
    (4000, 8000, 51, 100),
    (8000, 15000, 101, 200),
    (15000, 30000, 201, 300),
    (30000, np.inf, 301, 500),
]


In [151]:
# Scale the raw CO column by 1000 so it can be looked up in CO_BREAKPOINTS
# (the original cell describes this as a mg/m³ -> µg/m³ conversion).
# NOTE(review): in the rows shown by the initial df.head(), the dataset's own
# `max` column equals the largest raw pollutant value of the row (e.g. CO 73 ->
# max 73). That suggests the raw columns may already be ISPU sub-indices rather
# than concentrations, in which case neither this x1000 scaling nor the
# breakpoint interpolation should be applied - confirm the source units.
df["karbon_monoksida_ugm3"] = df["karbon_monoksida"].mul(1000)

df.loc[:, "karbon_monoksida_ugm3"].describe()


count     15257.000000
mean      20248.060999
std       12157.342274
min           0.000000
25%       11000.000000
50%       18000.000000
75%       26000.000000
max      134000.000000
Name: karbon_monoksida_ugm3, dtype: float64

In [152]:
# CO reading -> ISPU sub-index

def co_to_ispu(c, breakpoints=None):
    """Convert one CO reading to its ISPU sub-index by linear interpolation.

    Parameters
    ----------
    c : float
        Reading to convert, on the same scale as the band table. Missing
        values (``pd.isna``) give NaN.
    breakpoints : iterable of (C_lo, C_hi, I_lo, I_hi), optional
        Band table to interpolate in. Defaults to the notebook-level
        ``CO_BREAKPOINTS``.

    Returns
    -------
    float
        ``I_lo + (I_hi - I_lo) / (C_hi - C_lo) * (c - C_lo)`` for the first
        band whose closed interval ``[C_lo, C_hi]`` contains ``c``; ``I_lo``
        when that band is open-ended; NaN when no band matches (for example a
        negative reading).
    """
    if pd.isna(c):
        return np.nan

    table = CO_BREAKPOINTS if breakpoints is None else breakpoints
    for C_lo, C_hi, I_lo, I_hi in table:
        if not (C_lo <= c <= C_hi):
            continue
        if C_hi == np.inf:
            # Open-ended band: there is no upper concentration to interpolate
            # towards, so the index stays at the band floor. This was already
            # the result for finite readings (slope = x / inf = 0); making it
            # explicit also stops an infinite reading from producing NaN.
            return float(I_lo)
        return ((I_hi - I_lo) / (C_hi - C_lo)) * (c - C_lo) + I_lo

    return np.nan


In [153]:
# Derive the CO sub-index from the x1000-scaled CO column, then summarise it.
df["co_ispu"] = df["karbon_monoksida_ugm3"].map(co_to_ispu)

df.loc[:, "co_ispu"].describe()


count    15257.000000
mean       208.047657
std         74.795118
min          0.000000
25%        143.428571
50%        220.800000
75%        273.600000
max        301.000000
Name: co_ispu, dtype: float64

In [154]:
# CO sub-index: summary statistics (same summary the preceding cell displays).
df.loc[:, "co_ispu"].describe()


count    15257.000000
mean       208.047657
std         74.795118
min          0.000000
25%        143.428571
50%        220.800000
75%        273.600000
max        301.000000
Name: co_ispu, dtype: float64

In [155]:
# O3 (ozone): coerce the raw column to numeric (unparseable entries become NaN),
# then summarise it next to the date range.
df["ozon"] = df["ozon"].pipe(pd.to_numeric, errors="coerce")

df.loc[:, ["tanggal", "ozon"]].describe()


Unnamed: 0,tanggal,ozon
count,15257,15257.0
mean,2019-04-17 16:07:53.802189312,53.523891
min,2010-01-01 00:00:00,2.0
25%,2016-01-08 00:00:00,25.0
50%,2019-03-17 00:00:00,42.0
75%,2023-07-25 00:00:00,71.0
max,2025-08-31 00:00:00,314.0
std,,40.567943


In [156]:
# O3 ISPU breakpoints (Indonesia).
# One row per band: (C_lo, C_hi, I_lo, I_hi) = concentration range -> index range.
# NOTE(review): the last band is open-ended (C_hi = inf), so o3_to_ispu can
# never interpolate towards its I_hi of 500 - confirm whether the regulation
# defines a finite upper concentration for this band.
O3_BREAKPOINTS = [
    (0, 120, 0, 50),
    (120, 235, 51, 100),
    (235, 400, 101, 200),
    (400, 800, 201, 300),
    (800, np.inf, 301, 500),
]


In [157]:
# O3 reading -> ISPU sub-index

def o3_to_ispu(c, breakpoints=None):
    """Convert one O3 reading to its ISPU sub-index by linear interpolation.

    Parameters
    ----------
    c : float
        Reading to convert. Missing values (``pd.isna``) give NaN.
    breakpoints : iterable of (C_lo, C_hi, I_lo, I_hi), optional
        Band table to interpolate in. Defaults to the notebook-level
        ``O3_BREAKPOINTS``.

    Returns
    -------
    float
        ``I_lo + (I_hi - I_lo) / (C_hi - C_lo) * (c - C_lo)`` for the first
        band whose closed interval ``[C_lo, C_hi]`` contains ``c``; ``I_lo``
        when that band is open-ended; NaN when no band matches (for example a
        negative reading).
    """
    if pd.isna(c):
        return np.nan

    table = O3_BREAKPOINTS if breakpoints is None else breakpoints
    for C_lo, C_hi, I_lo, I_hi in table:
        if not (C_lo <= c <= C_hi):
            continue
        if C_hi == np.inf:
            # Open-ended band: there is no upper concentration to interpolate
            # towards, so the index stays at the band floor. This was already
            # the result for finite readings (slope = x / inf = 0); making it
            # explicit also stops an infinite reading from producing NaN.
            return float(I_lo)
        return ((I_hi - I_lo) / (C_hi - C_lo)) * (c - C_lo) + I_lo

    return np.nan


In [158]:
# Derive the O3 sub-index element-wise from the numeric ozone column.
df["o3_ispu"] = df["ozon"].map(o3_to_ispu)

# Show the raw value beside its sub-index.
df.loc[:, ["ozon", "o3_ispu"]]


Unnamed: 0,ozon,o3_ispu
0,27.0,11.250000
1,33.0,13.750000
2,20.0,8.333333
3,15.0,6.250000
4,15.0,6.250000
...,...,...
15252,21.0,8.750000
15253,19.0,7.916667
15254,15.0,6.250000
15255,18.0,7.500000


In [159]:
# Interim side-by-side summary of the PM10, PM2.5, CO and O3 sub-indices.
# `so2_ispu` is not part of this comparison.
df.loc[:, ["pm10_ispu", "pm25_ispu", "co_ispu", "o3_ispu"]].describe()


Unnamed: 0,pm10_ispu,pm25_ispu,co_ispu,o3_ispu
count,15257.0,7000.0,15257.0,15257.0
mean,49.778436,121.341467,208.047657,22.416113
std,14.030837,26.336524,74.795118,17.297531
min,2.0,32.258065,0.0,0.833333
25%,42.0,105.793684,143.428571,10.416667
50%,53.45,122.467368,220.8,17.5
75%,58.35,138.098947,273.6,29.583333
max,119.315,301.0,301.0,148.4


In [160]:
# NO2: coerce the raw column to numeric (unparseable entries become NaN),
# then summarise it next to the date range.
df["nitrogen_dioksida"] = df["nitrogen_dioksida"].pipe(pd.to_numeric, errors="coerce")

df.loc[:, ["tanggal", "nitrogen_dioksida"]].describe()


Unnamed: 0,tanggal,nitrogen_dioksida
count,15257,15257.0
mean,2019-04-17 16:07:53.802189312,17.238426
min,2010-01-01 00:00:00,0.0
25%,2016-01-08 00:00:00,10.0
50%,2019-03-17 00:00:00,15.0
75%,2023-07-25 00:00:00,22.0
max,2025-08-31 00:00:00,202.0
std,,11.563211


In [161]:
# NO2 ISPU breakpoints (Indonesia).
# One row per band: (C_lo, C_hi, I_lo, I_hi) = concentration range -> index range.
# NOTE(review): the last band is open-ended (C_hi = inf), so no2_to_ispu can
# never interpolate towards its I_hi of 500 - confirm whether the regulation
# defines a finite upper concentration for this band.
NO2_BREAKPOINTS = [
    (0, 80, 0, 50),
    (80, 200, 51, 100),
    (200, 1130, 101, 200),
    (1130, 2260, 201, 300),
    (2260, np.inf, 301, 500),
]


In [162]:
# NO2 reading -> ISPU sub-index

def no2_to_ispu(c, breakpoints=None):
    """Convert one NO2 reading to its ISPU sub-index by linear interpolation.

    Parameters
    ----------
    c : float
        Reading to convert. Missing values (``pd.isna``) give NaN.
    breakpoints : iterable of (C_lo, C_hi, I_lo, I_hi), optional
        Band table to interpolate in. Defaults to the notebook-level
        ``NO2_BREAKPOINTS``.

    Returns
    -------
    float
        ``I_lo + (I_hi - I_lo) / (C_hi - C_lo) * (c - C_lo)`` for the first
        band whose closed interval ``[C_lo, C_hi]`` contains ``c``; ``I_lo``
        when that band is open-ended; NaN when no band matches (for example a
        negative reading).
    """
    if pd.isna(c):
        return np.nan

    table = NO2_BREAKPOINTS if breakpoints is None else breakpoints
    for C_lo, C_hi, I_lo, I_hi in table:
        if not (C_lo <= c <= C_hi):
            continue
        if C_hi == np.inf:
            # Open-ended band: there is no upper concentration to interpolate
            # towards, so the index stays at the band floor. This was already
            # the result for finite readings (slope = x / inf = 0); making it
            # explicit also stops an infinite reading from producing NaN.
            return float(I_lo)
        return ((I_hi - I_lo) / (C_hi - C_lo)) * (c - C_lo) + I_lo

    return np.nan


In [163]:
# Derive the NO2 sub-index element-wise from the numeric NO2 column.
df["no2_ispu"] = df["nitrogen_dioksida"].map(no2_to_ispu)

# Show the raw value beside its sub-index.
df.loc[:, ["nitrogen_dioksida", "no2_ispu"]]


Unnamed: 0,nitrogen_dioksida,no2_ispu
0,14.0,8.750
1,9.0,5.625
2,9.0,5.625
3,6.0,3.750
4,8.0,5.000
...,...,...
15252,16.0,10.000
15253,39.0,24.375
15254,24.0,15.000
15255,17.0,10.625


In [164]:
# NO2 sub-index: summary statistics.
df.loc[:, "no2_ispu"].describe()


count    15257.000000
mean        10.769349
std          7.173301
min          0.000000
25%          6.250000
50%          9.375000
75%         13.750000
max        101.212903
Name: no2_ispu, dtype: float64

In [165]:
# Sub-index columns that feed the final ISPU value. The order matters only
# for tie-breaking when the critical pollutant is picked with idxmax.
ISPU_COLS = [
    "pm10_ispu", "pm25_ispu",
    "so2_ispu", "co_ispu",
    "o3_ispu", "no2_ispu",
]


In [166]:
# Final ISPU value per row = the largest available sub-index
# (missing sub-indices are skipped by the row-wise max).
df["ispu_val"] = df.loc[:, ISPU_COLS].max(axis="columns")

In [167]:
# Critical pollutant = name of the sub-index column holding the row maximum,
# with the "_ispu" suffix removed (e.g. "co_ispu" -> "co").
df["parameter_pencemar_kritis"] = (
    df.loc[:, ISPU_COLS].idxmax(axis="columns").str.replace("_ispu", "", regex=False)
)


In [171]:
# Sanity check: every sub-index beside the final ISPU value, the critical
# pollutant, the date and the row id.
# NOTE(review): no visible cell creates the "id" column and the execution
# counter jumps from 167 to 171 - presumably "id" comes from the CSV or from a
# cell that was removed; verify with Restart Kernel -> Run All.
df.loc[
    :,
    [*ISPU_COLS, "ispu_val", "parameter_pencemar_kritis", "tanggal", "id"],
]


Unnamed: 0,pm10_ispu,pm25_ispu,so2_ispu,co_ispu,o3_ispu,no2_ispu,ispu_val,parameter_pencemar_kritis,tanggal,id
0,55.9,,3.846154,301.000000,11.250000,8.750,301.000000,co,2010-01-01,2010-01-01_DKI1
1,32.0,,1.923077,207.600000,13.750000,5.625,207.600000,co,2010-01-02,2010-01-02_DKI1
2,27.0,,1.923077,227.400000,8.333333,5.625,227.400000,co,2010-01-03,2010-01-03_DKI1
3,22.0,,1.923077,207.600000,6.250000,3.750,207.600000,co,2010-01-04,2010-01-04_DKI1
4,25.0,,1.923077,214.200000,6.250000,5.000,214.200000,co,2010-01-05,2010-01-05_DKI1
...,...,...,...,...,...,...,...,...,...,...
15252,28.0,118.298947,43.269231,207.600000,8.750000,10.000,207.600000,co,2025-08-31,2025-08-31_DKI2
15253,28.0,105.793684,51.382812,100.000000,7.916667,24.375,105.793684,pm25,2025-08-31,2025-08-31_DKI3
15254,42.0,116.214737,27.884615,157.571429,6.250000,15.000,157.571429,co,2025-08-31,2025-08-31_DKI1
15255,47.0,104.751579,25.961538,129.285714,7.500000,10.625,129.285714,co,2025-08-31,2025-08-31_DKI4
