In [54]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display




NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column








In [55]:


# load merged data
path = find_file("merged_cuaca_ndvi_ispu.csv")
if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path)

df.head()


Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


In [56]:
df = df.sort_values(
    ["lokasi_clean", "tanggal"]
).reset_index(drop=True)

df.head()


Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


In [57]:
df["tanggal_next"] = df.groupby("lokasi_clean")["tanggal"].shift(-1)
df["kategori_next"] = df.groupby("lokasi_clean")["kategori"].shift(-1)

df[["lokasi_clean", "tanggal", "tanggal_next", "kategori", "kategori_next"]].head(10)


Unnamed: 0,lokasi_clean,tanggal,tanggal_next,kategori,kategori_next
0,DKI1,2010-01-01,2010-01-02,SEDANG,BAIK
1,DKI1,2010-01-02,2010-01-03,BAIK,BAIK
2,DKI1,2010-01-03,2010-01-04,BAIK,BAIK
3,DKI1,2010-01-04,2010-01-05,BAIK,BAIK
4,DKI1,2010-01-05,2010-01-06,BAIK,BAIK
5,DKI1,2010-01-06,2010-01-07,BAIK,BAIK
6,DKI1,2010-01-07,2010-01-08,BAIK,SEDANG
7,DKI1,2010-01-08,2010-01-09,SEDANG,SEDANG
8,DKI1,2010-01-09,2010-01-10,SEDANG,BAIK
9,DKI1,2010-01-10,2010-01-11,BAIK,SEDANG


In [58]:
# FORCE datetime conversion (robust)
df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")
df["tanggal_next"] = pd.to_datetime(df["tanggal_next"], errors="coerce")

# now detect valid day-to-day transitions
df["is_valid_transition"] = (
    (df["tanggal_next"] - df["tanggal"]) == pd.Timedelta(days=1)
)

df[["lokasi_clean", "tanggal", "tanggal_next", "is_valid_transition"]].head(10)


Unnamed: 0,lokasi_clean,tanggal,tanggal_next,is_valid_transition
0,DKI1,2010-01-01,2010-01-02,True
1,DKI1,2010-01-02,2010-01-03,True
2,DKI1,2010-01-03,2010-01-04,True
3,DKI1,2010-01-04,2010-01-05,True
4,DKI1,2010-01-05,2010-01-06,True
5,DKI1,2010-01-06,2010-01-07,True
6,DKI1,2010-01-07,2010-01-08,True
7,DKI1,2010-01-08,2010-01-09,True
8,DKI1,2010-01-09,2010-01-10,True
9,DKI1,2010-01-10,2010-01-11,True


In [59]:
# keep only true day-to-day transitions
transitions = df[df["is_valid_transition"]].copy()

# keep only columns needed for Markov
transitions = transitions[
    ["lokasi_clean", "tanggal", "kategori", "kategori_next"]
]

transitions.head()


Unnamed: 0,lokasi_clean,tanggal,kategori,kategori_next
0,DKI1,2010-01-01,SEDANG,BAIK
1,DKI1,2010-01-02,BAIK,BAIK
2,DKI1,2010-01-03,BAIK,BAIK
3,DKI1,2010-01-04,BAIK,BAIK
4,DKI1,2010-01-05,BAIK,BAIK


In [60]:
# how many valid transitions per station?
transitions.groupby("lokasi_clean").size()


lokasi_clean
DKI1    2766
DKI2    2838
DKI3    2586
DKI4    2995
DKI5    2718
dtype: int64

In [61]:
# collect all possible ISPU categories (states)
states = sorted(
    set(transitions["kategori"]) |
    set(transitions["kategori_next"])
)

states


['BAIK', 'BERBAHAYA', 'SANGAT TIDAK SEHAT', 'SEDANG', 'TIDAK SEHAT']

In [62]:
state_to_idx = {state: i for i, state in enumerate(states)}
idx_to_state = {i: state for state, i in state_to_idx.items()}

state_to_idx


{'BAIK': 0,
 'BERBAHAYA': 1,
 'SANGAT TIDAK SEHAT': 2,
 'SEDANG': 3,
 'TIDAK SEHAT': 4}

In [63]:
# build transition matrices for all stations
station_matrices = {}

for station in transitions["lokasi_clean"].unique():
    station_data = transitions[transitions["lokasi_clean"] == station]
    
    transition_counts = np.zeros((n_states, n_states), dtype=int)
    
    for _, row in station_data.iterrows():
        i = state_to_idx[row["kategori"]]
        j = state_to_idx[row["kategori_next"]]
        transition_counts[i, j] += 1
    
    station_matrices[station] = transition_counts


In [64]:
station_transition_probs = {}

for station, counts in station_matrices.items():
    row_sums = counts.sum(axis=1, keepdims=True)

    probs = np.zeros_like(counts, dtype=float)  # important!

    valid_rows = row_sums[:, 0] > 0
    probs[valid_rows] = counts[valid_rows] / row_sums[valid_rows]

    station_transition_probs[station] = probs


In [65]:
station = "DKI1"
station_transition_probs[station]


array([[0.5824    , 0.        , 0.        , 0.416     , 0.0016    ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.13067061, 0.        , 0.        , 0.82889546, 0.04043393],
       [0.01769912, 0.        , 0.        , 0.7079646 , 0.27433628]])

In [66]:
station = "DKI2"
station_transition_probs[station]


array([[0.6510989 , 0.        , 0.        , 0.31868132, 0.03021978],
       [0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.00735294, 0.71323529, 0.        , 0.27941176],
       [0.06771654, 0.        , 0.0015748 , 0.83307087, 0.0976378 ],
       [0.00462963, 0.        , 0.08333333, 0.43055556, 0.48148148]])

In [67]:
def pretty_matrix(matrix, states):
    return pd.DataFrame(
        matrix,
        index=[f"FROM_{s}" for s in states],
        columns=[f"TO_{s}" for s in states]
    )

# example: inspect DKI1
pretty_matrix(station_transition_probs["DKI1"], states)

Unnamed: 0,TO_BAIK,TO_BERBAHAYA,TO_SANGAT TIDAK SEHAT,TO_SEDANG,TO_TIDAK SEHAT
FROM_BAIK,0.5824,0.0,0.0,0.416,0.0016
FROM_BERBAHAYA,0.0,0.0,0.0,0.0,0.0
FROM_SANGAT TIDAK SEHAT,0.0,0.0,0.0,0.0,0.0
FROM_SEDANG,0.130671,0.0,0.0,0.828895,0.040434
FROM_TIDAK SEHAT,0.017699,0.0,0.0,0.707965,0.274336


In [68]:
for station in station_transition_probs:
    print(f"\n=== {station} ===")
    display(pretty_matrix(station_transition_probs[station], states))



=== DKI1 ===


Unnamed: 0,TO_BAIK,TO_BERBAHAYA,TO_SANGAT TIDAK SEHAT,TO_SEDANG,TO_TIDAK SEHAT
FROM_BAIK,0.5824,0.0,0.0,0.416,0.0016
FROM_BERBAHAYA,0.0,0.0,0.0,0.0,0.0
FROM_SANGAT TIDAK SEHAT,0.0,0.0,0.0,0.0,0.0
FROM_SEDANG,0.130671,0.0,0.0,0.828895,0.040434
FROM_TIDAK SEHAT,0.017699,0.0,0.0,0.707965,0.274336



=== DKI2 ===


Unnamed: 0,TO_BAIK,TO_BERBAHAYA,TO_SANGAT TIDAK SEHAT,TO_SEDANG,TO_TIDAK SEHAT
FROM_BAIK,0.651099,0.0,0.0,0.318681,0.03022
FROM_BERBAHAYA,0.0,0.0,1.0,0.0,0.0
FROM_SANGAT TIDAK SEHAT,0.0,0.007353,0.713235,0.0,0.279412
FROM_SEDANG,0.067717,0.0,0.001575,0.833071,0.097638
FROM_TIDAK SEHAT,0.00463,0.0,0.083333,0.430556,0.481481



=== DKI3 ===


Unnamed: 0,TO_BAIK,TO_BERBAHAYA,TO_SANGAT TIDAK SEHAT,TO_SEDANG,TO_TIDAK SEHAT
FROM_BAIK,0.672973,0.0,0.0,0.313514,0.013514
FROM_BERBAHAYA,0.0,0.0,0.0,0.0,0.0
FROM_SANGAT TIDAK SEHAT,0.0,0.0,0.142857,0.285714,0.571429
FROM_SEDANG,0.059686,0.0,0.001047,0.84712,0.092147
FROM_TIDAK SEHAT,0.006689,0.0,0.010033,0.588629,0.394649



=== DKI4 ===


Unnamed: 0,TO_BAIK,TO_BERBAHAYA,TO_SANGAT TIDAK SEHAT,TO_SEDANG,TO_TIDAK SEHAT
FROM_BAIK,0.606936,0.0,0.0,0.378613,0.014451
FROM_BERBAHAYA,0.0,0.0,0.0,0.0,0.0
FROM_SANGAT TIDAK SEHAT,0.0,0.0,0.375,0.375,0.25
FROM_SEDANG,0.068897,0.0,0.000499,0.805292,0.125312
FROM_TIDAK SEHAT,0.003135,0.0,0.00627,0.387147,0.603448



=== DKI5 ===


Unnamed: 0,TO_BAIK,TO_BERBAHAYA,TO_SANGAT TIDAK SEHAT,TO_SEDANG,TO_TIDAK SEHAT
FROM_BAIK,0.678643,0.0,0.0,0.305389,0.015968
FROM_BERBAHAYA,0.0,0.0,0.0,0.0,0.0
FROM_SANGAT TIDAK SEHAT,0.0,0.0,0.269231,0.038462,0.692308
FROM_SEDANG,0.089736,0.0,0.00176,0.778299,0.130205
FROM_TIDAK SEHAT,0.002058,0.0,0.032922,0.45679,0.50823


In [69]:
# last known category for each station
last_state_per_station = (
    df.sort_values(["lokasi_clean", "tanggal"])
      .groupby("lokasi_clean")
      .tail(1)
      .set_index("lokasi_clean")["kategori"]
)

last_state_per_station


lokasi_clean
DKI1    SEDANG
DKI2    SEDANG
DKI3    SEDANG
DKI4    SEDANG
DKI5    SEDANG
Name: kategori, dtype: object

In [70]:
predictions_20250901 = {}

for station, current_state in last_state_per_station.items():
    probs = station_transition_probs.get(station)

    # safety checks
    if probs is None or current_state not in state_to_idx:
        predictions_20250901[station] = current_state
        continue

    idx = state_to_idx[current_state]

    # if no information for this row, persist
    if probs[idx].sum() == 0:
        predictions_20250901[station] = current_state
    else:
        next_idx = probs[idx].argmax()
        predictions_20250901[station] = idx_to_state[next_idx]

predictions_20250901


{'DKI1': 'SEDANG',
 'DKI2': 'SEDANG',
 'DKI3': 'SEDANG',
 'DKI4': 'SEDANG',
 'DKI5': 'SEDANG'}

In [71]:
for station, pred in predictions_20250901.items():
    print(f"{station} → 2025-09-01 predicted ISPU: {pred}")


DKI1 → 2025-09-01 predicted ISPU: SEDANG
DKI2 → 2025-09-01 predicted ISPU: SEDANG
DKI3 → 2025-09-01 predicted ISPU: SEDANG
DKI4 → 2025-09-01 predicted ISPU: SEDANG
DKI5 → 2025-09-01 predicted ISPU: SEDANG


In [72]:
pred_df = (
    pd.DataFrame.from_dict(
        predictions_20250901,
        orient="index",
        columns=["kategori"]
    )
    .reset_index()
    .rename(columns={"index": "lokasi_clean"})
)

pred_df


Unnamed: 0,lokasi_clean,kategori
0,DKI1,SEDANG
1,DKI2,SEDANG
2,DKI3,SEDANG
3,DKI4,SEDANG
4,DKI5,SEDANG


In [74]:

# load merged data
path = find_file("sample_submission.csv")
if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

submission = pd.read_csv(path)


submission


Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,
...,...,...
450,2025-11-30_DKI1,
451,2025-11-30_DKI2,
452,2025-11-30_DKI3,
453,2025-11-30_DKI4,


In [75]:
submission["lokasi_clean"] = submission["id"].str.split("_").str[-1]

submission


Unnamed: 0,id,category,lokasi_clean
0,2025-09-01_DKI1,,DKI1
1,2025-09-01_DKI2,,DKI2
2,2025-09-01_DKI3,,DKI3
3,2025-09-01_DKI4,,DKI4
4,2025-09-01_DKI5,,DKI5
...,...,...,...
450,2025-11-30_DKI1,,DKI1
451,2025-11-30_DKI2,,DKI2
452,2025-11-30_DKI3,,DKI3
453,2025-11-30_DKI4,,DKI4


In [76]:
submission["kategori"] = submission["lokasi_clean"].map(predictions_20250901)

submission


Unnamed: 0,id,category,lokasi_clean,kategori
0,2025-09-01_DKI1,,DKI1,SEDANG
1,2025-09-01_DKI2,,DKI2,SEDANG
2,2025-09-01_DKI3,,DKI3,SEDANG
3,2025-09-01_DKI4,,DKI4,SEDANG
4,2025-09-01_DKI5,,DKI5,SEDANG
...,...,...,...,...
450,2025-11-30_DKI1,,DKI1,SEDANG
451,2025-11-30_DKI2,,DKI2,SEDANG
452,2025-11-30_DKI3,,DKI3,SEDANG
453,2025-11-30_DKI4,,DKI4,SEDANG


In [77]:
submission["kategori"].isna().sum()


np.int64(0)

In [78]:
final_submission = submission[["id", "kategori"]]

final_submission


Unnamed: 0,id,kategori
0,2025-09-01_DKI1,SEDANG
1,2025-09-01_DKI2,SEDANG
2,2025-09-01_DKI3,SEDANG
3,2025-09-01_DKI4,SEDANG
4,2025-09-01_DKI5,SEDANG
...,...,...
450,2025-11-30_DKI1,SEDANG
451,2025-11-30_DKI2,SEDANG
452,2025-11-30_DKI3,SEDANG
453,2025-11-30_DKI4,SEDANG


In [79]:
final_submission.to_csv("submission_markov_1.csv", index=False)
