In [187]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from IPython.display import display

np.random.seed(42)



NA_VALUES = ["---", "--", "", " ", "NA", "N/A"]
# =========================
# FILE FINDER
# =========================
def find_file(name, start=Path.cwd()):
    for ancestor in [start] + list(start.parents):
        matches = list(ancestor.rglob(name))
        if matches:
            return matches[0]
    return None


def find_files(file_map):
    found = {}
    for key, filename in file_map.items():
        path = find_file(filename)
        if path:
            found[key] = path
        else:
            print(f"[WARNING] File not found: {filename}")
    return found

eda_script_path = find_file("script_eda.py")
if eda_script_path is None:
    raise FileNotFoundError("❌ script_eda.py tidak ditemukan di parent directory")

# tambahkan BASE PROJECT ke sys.path
sys.path.append(str(eda_script_path.parent))

# sekarang bisa import
from script_eda import evaluate_dataset, extract_column_schema,find_internal_duplicate_columns,extract_single_schema,cek_value_data_column








In [188]:


# load merged data
path = find_file("merged_cuaca_ndvi_ispu.csv")
if path is None:
    raise FileNotFoundError("❌ File merged tidak ditemukan")

df = pd.read_csv(path)

df.head()


Unnamed: 0,tanggal,periode_data,stasiun,pm_sepuluh,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,...,cloud_cover_min (%),wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),wind_gusts_10m_min (km/h),wind_speed_10m_min (km/h),surface_pressure_max (hPa),surface_pressure_min (hPa),lokasi,lokasi_clean,ndvi
0,2010-01-01,201001,DKI1 (Bunderan HI),60.0,4.0,73.0,27.0,14.0,73.0,CO,...,99.0,21.0,10.5,11.9,6.9,1009.3,1005.1,dki1_bundaranhi,DKI1,0.2023
1,2010-01-02,201001,DKI1 (Bunderan HI),32.0,2.0,16.0,33.0,9.0,33.0,O3,...,91.0,16.5,7.7,9.0,4.4,1009.9,1006.0,dki1_bundaranhi,DKI1,0.2023
2,2010-01-03,201001,DKI1 (Bunderan HI),27.0,2.0,19.0,20.0,9.0,27.0,PM10,...,81.0,18.4,9.4,11.9,6.5,1010.5,1006.5,dki1_bundaranhi,DKI1,0.2023
3,2010-01-04,201001,DKI1 (Bunderan HI),22.0,2.0,16.0,15.0,6.0,22.0,PM10,...,17.0,23.8,13.5,14.4,9.6,1009.1,1005.1,dki1_bundaranhi,DKI1,0.2023
4,2010-01-05,201001,DKI1 (Bunderan HI),25.0,2.0,17.0,15.0,8.0,25.0,PM10,...,99.0,21.6,11.1,10.4,7.8,1009.1,1006.0,dki1_bundaranhi,DKI1,0.2023


In [189]:
df["tanggal"] = pd.to_datetime(df["tanggal"], errors="coerce")
df["lokasi_clean"] = df["lokasi_clean"].astype(str)
df["kategori"] = df["kategori"].astype(str)

df = df.dropna(subset=["tanggal", "lokasi_clean", "kategori"])

df[["tanggal", "lokasi_clean", "kategori"]].head()


Unnamed: 0,tanggal,lokasi_clean,kategori
0,2010-01-01,DKI1,SEDANG
1,2010-01-02,DKI1,BAIK
2,2010-01-03,DKI1,BAIK
3,2010-01-04,DKI1,BAIK
4,2010-01-05,DKI1,BAIK


In [190]:
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)


In [191]:
SPLIT_DATE = pd.to_datetime("2024-12-31")

df_train = df[df["tanggal"] <= SPLIT_DATE].copy()
df_val   = df[df["tanggal"] > SPLIT_DATE].copy()

print(df_train["tanggal"].min(), "→", df_train["tanggal"].max())
print(df_val["tanggal"].min(), "→", df_val["tanggal"].max())


2010-01-01 00:00:00 → 2024-12-31 00:00:00
2025-01-01 00:00:00 → 2025-08-31 00:00:00


In [192]:
# one cell after split

df_train["tanggal_next"] = df_train.groupby("lokasi_clean")["tanggal"].shift(-1)
df_train["kategori_next"] = df_train.groupby("lokasi_clean")["kategori"].shift(-1)

df_train["is_valid_transition"] = (
    (df_train["tanggal_next"] - df_train["tanggal"]) == pd.Timedelta(days=1)
)

train_transitions = df_train[df_train["is_valid_transition"]][
    ["lokasi_clean", "kategori", "kategori_next"]
]

train_transitions.head()


Unnamed: 0,lokasi_clean,kategori,kategori_next
0,DKI1,SEDANG,BAIK
1,DKI1,BAIK,BAIK
2,DKI1,BAIK,BAIK
3,DKI1,BAIK,BAIK
4,DKI1,BAIK,BAIK


In [193]:
# two cell after split
states = sorted(
    set(train_transitions["kategori"]) |
    set(train_transitions["kategori_next"])
)

state_to_idx = {s: i for i, s in enumerate(states)}
idx_to_state = {i: s for s, i in state_to_idx.items()}

states


['BAIK', 'BERBAHAYA', 'SANGAT TIDAK SEHAT', 'SEDANG', 'TIDAK SEHAT']

In [194]:
# three cell after split
station_matrices = {}
n_states = len(states)

for station in train_transitions["lokasi_clean"].unique():
    data = train_transitions[train_transitions["lokasi_clean"] == station]
    counts = np.zeros((n_states, n_states), dtype=int)

    for _, row in data.iterrows():
        i = state_to_idx[row["kategori"]]
        j = state_to_idx[row["kategori_next"]]
        counts[i, j] += 1

    station_matrices[station] = counts


In [195]:
# four cell after split

station_transition_probs = {}

for station, counts in station_matrices.items():
    probs = np.zeros_like(counts, dtype=float)
    row_sums = counts.sum(axis=1)

    valid = row_sums > 0
    probs[valid] = counts[valid] / row_sums[valid][:, None]

    station_transition_probs[station] = probs


In [196]:
# five cell after split (FIXED)

def persistence_predict(val_pairs):
    # persistence baseline: tomorrow = today
    return val_pairs["kategori"].values


In [197]:
# six cell after split

def markov_predict_stochastic(df_val, temperature=1.3, seed=42):
    np.random.seed(seed)
    preds = []

    for _, row in df_val.iterrows():
        station = row["lokasi_clean"]
        current_state = row["kategori"]

        probs = station_transition_probs.get(station)
        if probs is None or current_state not in state_to_idx:
            preds.append(current_state)
            continue

        idx = state_to_idx[current_state]
        row_probs = probs[idx]

        if row_probs.sum() == 0:
            preds.append(current_state)
            continue

        # temperature scaling
        scaled = np.power(row_probs, 1 / temperature)
        scaled = scaled / scaled.sum()

        next_idx = np.random.choice(len(states), p=scaled)
        preds.append(idx_to_state[next_idx])

    return preds


In [198]:
# seven cell after split (FIXED)
from sklearn.metrics import accuracy_score

# build valid (t -> t+1) validation pairs
df_val = df_val.sort_values(["lokasi_clean", "tanggal"])

df_val["kategori_next"] = df_val.groupby("lokasi_clean")["kategori"].shift(-1)
df_val["tanggal_next"]  = df_val.groupby("lokasi_clean")["tanggal"].shift(-1)

df_val["is_valid"] = (
    (df_val["tanggal_next"] - df_val["tanggal"]) == pd.Timedelta(days=1)
)

val_pairs = df_val[df_val["is_valid"]].copy()

# ground truth = next day
y_true = val_pairs["kategori_next"].values

# predictions
baseline_preds = persistence_predict(val_pairs)
markov_preds   = markov_predict_stochastic(val_pairs)

print("Persistence accuracy:", accuracy_score(y_true, baseline_preds))
print("Enhanced Markov accuracy:", accuracy_score(y_true, markov_preds))


Persistence accuracy: 0.7219882055602359
Enhanced Markov accuracy: 0.5720303285593934


In [199]:
# eight cell after split

def hybrid_predict(val_pairs, threshold=0.15, temperature=1.3):
    preds = []

    for _, row in val_pairs.iterrows():
        station = row["lokasi_clean"]
        current_state = row["kategori"]

        probs = station_transition_probs.get(station)
        if probs is None or current_state not in state_to_idx:
            preds.append(current_state)
            continue

        idx = state_to_idx[current_state]
        row_probs = probs[idx]

        if row_probs.sum() == 0:
            preds.append(current_state)
            continue

        # probability of staying the same
        stay_prob = row_probs[idx]

        # if Markov strongly suggests change → trust it
        if 1 - stay_prob > threshold:
            scaled = np.power(row_probs, 1 / temperature)
            scaled = scaled / scaled.sum()
            next_idx = np.random.choice(len(states), p=scaled)
            preds.append(idx_to_state[next_idx])
        else:
            preds.append(current_state)  # persistence fallback

    return preds


In [200]:
# 9 cell after split
hybrid_preds = hybrid_predict(val_pairs)

print(
    "Hybrid accuracy:",
    accuracy_score(y_true, hybrid_preds)
)


Hybrid accuracy: 0.5585509688289806


In [201]:
# 10 cell after split
change_prob = {}

for station, probs in station_transition_probs.items():
    station_change = {}

    for state, idx in state_to_idx.items():
        row = probs[idx]

        if row.sum() == 0:
            station_change[state] = 0.0
        else:
            stay_prob = row[idx]
            station_change[state] = 1.0 - stay_prob

    change_prob[station] = station_change

# example inspection
change_prob["DKI1"]

{'BAIK': np.float64(0.4091680814940577),
 'BERBAHAYA': 0.0,
 'SANGAT TIDAK SEHAT': 0.0,
 'SEDANG': np.float64(0.16955579631635964),
 'TIDAK SEHAT': np.float64(0.7555555555555555)}

In [202]:
# NEW CELL 1 — class weights (tunable)

class_weights = {
    "SEDANG": 1.0,
    "BAIK": 1.5,
    "TIDAK SEHAT": 1.5,
}


In [203]:
    def weighted_markov_predict(df_input, temperature, weights):
        preds = []

        for _, row in df_input.iterrows():
            station = row["lokasi_clean"]
            current = row["kategori"]

            if station not in station_transition_probs or current not in state_to_idx:
                preds.append(current)
                continue

            probs = station_transition_probs[station]
            idx = state_to_idx[current]
            row_probs = probs[idx]

            if row_probs.sum() == 0:
                preds.append(current)
                continue

            scaled = np.power(row_probs, 1 / temperature)
            scaled = scaled / scaled.sum()

            weighted = np.array([
                scaled[i] * weights.get(idx_to_state[i], 1.0)
                for i in range(len(states))
            ])

            weighted = weighted / weighted.sum()
            next_idx = np.random.choice(len(states), p=weighted)

            preds.append(idx_to_state[next_idx])

        return preds


In [204]:
weighted_preds = weighted_markov_predict(
    val_pairs,
    temperature=1.3,
    weights=class_weights
)

print(
    "Weighted Markov accuracy:",
    accuracy_score(y_true, weighted_preds)
)

from collections import Counter
Counter(weighted_preds)


Weighted Markov accuracy: 0.556866048862679


Counter({'SEDANG': 682,
         'BAIK': 275,
         'TIDAK SEHAT': 226,
         'SANGAT TIDAK SEHAT': 4})

In [205]:
# NEW CELL 4 — weight tuning

weight_grid = [
    {"SEDANG": 1.0, "BAIK": 1.5, "TIDAK SEHAT": 1.5},
    {"SEDANG": 1.0, "BAIK": 2.0, "TIDAK SEHAT": 2.0},
    {"SEDANG": 1.0, "BAIK": 2.5, "TIDAK SEHAT": 2.5},
    {"SEDANG": 0.8, "BAIK": 2.0, "TIDAK SEHAT": 2.0},
    {"SEDANG": 0.7, "BAIK": 2.5, "TIDAK SEHAT": 2.5},
]

for w in weight_grid:
    preds = weighted_markov_predict(
        val_pairs,
        temperature=1.3,
        weights=w
    )
    acc = accuracy_score(y_true, preds)
    print(f"weights={w} → accuracy={acc:.4f}")


weights={'SEDANG': 1.0, 'BAIK': 1.5, 'TIDAK SEHAT': 1.5} → accuracy=0.5459
weights={'SEDANG': 1.0, 'BAIK': 2.0, 'TIDAK SEHAT': 2.0} → accuracy=0.5139
weights={'SEDANG': 1.0, 'BAIK': 2.5, 'TIDAK SEHAT': 2.5} → accuracy=0.4768
weights={'SEDANG': 0.8, 'BAIK': 2.0, 'TIDAK SEHAT': 2.0} → accuracy=0.4920
weights={'SEDANG': 0.7, 'BAIK': 2.5, 'TIDAK SEHAT': 2.5} → accuracy=0.4431


In [206]:
FINAL_CLASS_WEIGHTS = {
    "SEDANG": 1.0,
    "BAIK": 1.5,
    "TIDAK SEHAT": 1.5,
}
FINAL_TEMPERATURE = 1.3


In [207]:
sample_path = find_file("sample_submission.csv")
sample_sub = pd.read_csv(sample_path)

sample_sub.head()


Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [208]:
sample_sub["lokasi_clean"] = sample_sub["id"].str.extract(r"(DKI\d)")
sample_sub.head()


Unnamed: 0,id,category,lokasi_clean
0,2025-09-01_DKI1,,DKI1
1,2025-09-01_DKI2,,DKI2
2,2025-09-01_DKI3,,DKI3
3,2025-09-01_DKI4,,DKI4
4,2025-09-01_DKI5,,DKI5


In [209]:
last_state = (
    df_full
    .sort_values("tanggal")
    .groupby("lokasi_clean")
    .tail(1)[["lokasi_clean", "kategori"]]
)

last_state


Unnamed: 0,lokasi_clean,kategori
2862,DKI1,SEDANG
12284,DKI4,SEDANG
8925,DKI3,SEDANG
5978,DKI2,SEDANG
15256,DKI5,SEDANG


In [210]:
sample_sub = sample_sub.merge(
    last_state,
    on="lokasi_clean",
    how="left"
)

sample_sub.head()


Unnamed: 0,id,category,lokasi_clean,kategori
0,2025-09-01_DKI1,,DKI1,SEDANG
1,2025-09-01_DKI2,,DKI2,SEDANG
2,2025-09-01_DKI3,,DKI3,SEDANG
3,2025-09-01_DKI4,,DKI4,SEDANG
4,2025-09-01_DKI5,,DKI5,SEDANG


In [211]:
from collections import Counter
Counter(sample_sub["kategori"])


Counter({'SEDANG': 455})

In [212]:
np.random.seed(42)  # reproducible submission

submission_preds = weighted_markov_predict(
    sample_sub,
    temperature=FINAL_TEMPERATURE,
    weights=FINAL_CLASS_WEIGHTS
)

sample_sub["kategori"] = submission_preds


In [213]:
Counter(sample_sub["kategori"])


Counter({'SEDANG': 282,
         'TIDAK SEHAT': 86,
         'BAIK': 84,
         'SANGAT TIDAK SEHAT': 3})

In [None]:
submission = sample_sub[["id", "kategori"]]
submission.to_csv("submission_markov_2.csv", index=False)
