In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
import lightgbm as lgb
import optuna

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score


In [17]:
PATH = Path("merged_libur_cuaca_ispu_ndvi.csv")
df = pd.read_csv(PATH, sep=";")

df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

DROP_COLS = ["max", "parameter_pencemar_kritis", "time", "id", "stasiun"]
df = df.drop(columns=DROP_COLS, errors="ignore")

LABEL_MAP = {"BAIK": 0, "SEDANG": 1, "TIDAK SEHAT": 2}
df = df[df["kategori"].notna()].copy()
df["target"] = df["kategori"].map(LABEL_MAP).astype(int)


In [18]:
# 3. REINDEX HARIAN

def reindex_daily(g):
    idx = pd.date_range(g["tanggal"].min(), g["tanggal"].max(), freq="D")
    g = g.set_index("tanggal").reindex(idx)
    g["lokasi_clean"] = g["lokasi_clean"].iloc[0]
    return g.reset_index().rename(columns={"index": "tanggal"})

df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)

# 4. HANDLE PM2.5 MISSING
df["pm25_missing"] = df["pm_duakomalima"].isna().astype(int)

median_pm25 = (
    df[df["tanggal"] >= "2021-01-01"]
    .groupby("lokasi_clean")["pm_duakomalima"]
    .median()
)

df["pm_duakomalima"] = df["pm_duakomalima"].fillna(
    df["lokasi_clean"].map(median_pm25)
)

df = df[df["target"].notna()].copy()


# 5. TIME FEATURES
df["month"] = df["tanggal"].dt.month
df["dayofyear"] = df["tanggal"].dt.dayofyear
df["dayofweek"] = df["tanggal"].dt.dayofweek

df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)

df["doy_sin"] = np.sin(2*np.pi*df["dayofyear"]/365)
df["doy_cos"] = np.cos(2*np.pi*df["dayofyear"]/365)

df["dow_sin"] = np.sin(2*np.pi*df["dayofweek"]/7)
df["dow_cos"] = np.cos(2*np.pi*df["dayofweek"]/7)


# 6. DEFINE COLS
POLLUTANT_COLS = [
    "pm_sepuluh","pm_duakomalima","ozon",
    "nitrogen_dioksida","sulfur_dioksida","karbon_monoksida"
]

# 7. BUILD FORECAST DATASET
def build_forecast_df(df_source, H=30):
    rows = []

    for loc, g in df_source.groupby("lokasi_clean"):
        g = g.sort_values("tanggal").reset_index(drop=True)

        for col in POLLUTANT_COLS:
            g[f"{col}_lag1"] = g[col].shift(0)
            g[f"{col}_lag2"] = g[col].shift(1)
            g[f"{col}_lag3"] = g[col].shift(2)

        for h in range(1, H+1):
            g_future = g.shift(-h)

            temp = pd.DataFrame({
                "lokasi_clean": loc,
                "tanggal": g["tanggal"],
                "horizon": h,
                "month": g["tanggal"].dt.month,
                "dayofyear": g["tanggal"].dt.dayofyear,
                "dayofweek": g["tanggal"].dt.dayofweek,
            })

            for col in POLLUTANT_COLS:
                temp[f"{col}_lag1"] = g[f"{col}_lag1"]
                temp[f"{col}_lag2"] = g[f"{col}_lag2"]
                temp[f"{col}_lag3"] = g[f"{col}_lag3"]
                temp[f"target_{col}"] = g_future[col]

            rows.append(temp)

    return pd.concat(rows, ignore_index=True).dropna().reset_index(drop=True)


df_model = df.reset_index(drop=True)
df_forecast = build_forecast_df(df_model)

# 8. ENCODE LOCATION
le_loc = LabelEncoder()
df_model["lokasi_enc"] = le_loc.fit_transform(df_model["lokasi_clean"])
df_forecast["lokasi_enc"] = le_loc.transform(df_forecast["lokasi_clean"])





  df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)


In [19]:
# 9. TIME SERIES SPLIT
SPLIT_DATE = "2024-01-01"

df_forecast_train = df_forecast[df_forecast["tanggal"] < SPLIT_DATE]
df_forecast_val   = df_forecast[df_forecast["tanggal"] >= SPLIT_DATE]


In [20]:
# 10. TRAIN FORECAST MODELS (EARLY STOPPING)
pollutant_models = {}

FEATURES_F = (
    [f"{c}_lag1" for c in POLLUTANT_COLS] +
    [f"{c}_lag2" for c in POLLUTANT_COLS] +
    [f"{c}_lag3" for c in POLLUTANT_COLS] +
    ["month","dayofyear","dayofweek","lokasi_enc","horizon"]
)

for col in POLLUTANT_COLS:

    y_col = f"target_{col}"

    train_data = lgb.Dataset(
        df_forecast_train[FEATURES_F],
        label=df_forecast_train[y_col]
    )

    val_data = lgb.Dataset(
        df_forecast_val[FEATURES_F],
        label=df_forecast_val[y_col]
    )

    model = lgb.train(
        dict(objective="regression", metric="l2", learning_rate=0.05, seed=42),
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    pollutant_models[col] = model


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[146]	valid_0's l2: 190.877
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[159]	valid_0's l2: 420.382
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[127]	valid_0's l2: 111.444
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[59]	valid_0's l2: 265.372
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[84]	valid_0's l2: 98.1821
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[46]	valid_0's l2: 34.3865


In [21]:
# 11. BUILD ISPU LAG
for col in POLLUTANT_COLS:
    for lag in [1,2,3]:
        df_model[f"{col}_lag{lag}"] = df_model.groupby("lokasi_clean")[col].shift(lag)

df_model = df_model.dropna().reset_index(drop=True)

# 12. SPLIT ISPU
train_df = df_model[df_model["tanggal"] < SPLIT_DATE]
val_df   = df_model[df_model["tanggal"] >= SPLIT_DATE]

# 13. FEATURES ISPU
ISPU_FEATURES = (
    POLLUTANT_COLS +
    [f"{c}_lag{l}" for c in POLLUTANT_COLS for l in [1,2,3]] +
    ["month_sin","month_cos","doy_sin","doy_cos","dow_sin","dow_cos","lokasi_enc"]
)

X_train, y_train = train_df[ISPU_FEATURES], train_df["target"]
X_val,   y_val   = val_df[ISPU_FEATURES],   val_df["target"]


In [22]:
# 14. OPTUNA
def objective(trial):

    params = {
        "objective":"multiclass",
        "num_class":3,
        "metric":"multi_logloss",
        "learning_rate":trial.suggest_float("lr",0.01,0.1,log=True),
        "num_leaves":trial.suggest_int("leaves",31,255),
        "min_data_in_leaf":trial.suggest_int("minleaf",10,80),
        "feature_fraction":trial.suggest_float("ff",0.6,1.0),
        "bagging_fraction":trial.suggest_float("bf",0.6,1.0),
        "bagging_freq":trial.suggest_int("bq",1,7),
        "lambda_l1":trial.suggest_float("l1",0,5),
        "lambda_l2":trial.suggest_float("l2",0,5),
        "verbosity":-1,
        "seed":42
    }

    model = lgb.train(
        params,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=2000,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
    )

    pred = model.predict(X_val).argmax(1)
    return f1_score(y_val, pred, average="macro")


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

best_params = study.best_params


[I 2026-02-07 22:13:01,492] A new study created in memory with name: no-name-d47294d5-f099-4e9a-a722-e6bee0089752


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:03,643] Trial 0 finished with value: 0.9178569685331691 and parameters: {'lr': 0.052361280051298184, 'leaves': 145, 'minleaf': 60, 'ff': 0.9063781486199081, 'bf': 0.6097957258752016, 'bq': 1, 'l1': 1.3619214960901982, 'l2': 3.3422280184001125}. Best is trial 0 with value: 0.9178569685331691.


Early stopping, best iteration is:
[96]	valid_0's multi_logloss: 0.176381
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:06,908] Trial 1 finished with value: 0.9106729327178229 and parameters: {'lr': 0.01779701704090814, 'leaves': 152, 'minleaf': 76, 'ff': 0.9620189207602411, 'bf': 0.7961915651264743, 'bq': 2, 'l1': 4.575356389596792, 'l2': 2.005877270741618}. Best is trial 0 with value: 0.9178569685331691.


Early stopping, best iteration is:
[328]	valid_0's multi_logloss: 0.178757
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[520]	valid_0's multi_logloss: 0.172085


[I 2026-02-07 22:13:11,039] Trial 2 finished with value: 0.926739729191402 and parameters: {'lr': 0.010366064129489858, 'leaves': 209, 'minleaf': 35, 'ff': 0.9389838481382388, 'bf': 0.6885268611389043, 'bq': 5, 'l1': 4.895023153311403, 'l2': 1.1539673866399258}. Best is trial 2 with value: 0.926739729191402.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:13,423] Trial 3 finished with value: 0.917991670069816 and parameters: {'lr': 0.022020376748230524, 'leaves': 228, 'minleaf': 58, 'ff': 0.6992253267680759, 'bf': 0.8473295058329571, 'bq': 3, 'l1': 4.854365385509581, 'l2': 2.2640454308921}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[293]	valid_0's multi_logloss: 0.168313
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:17,241] Trial 4 finished with value: 0.916618866082155 and parameters: {'lr': 0.011096422122792786, 'leaves': 46, 'minleaf': 43, 'ff': 0.6679397308064615, 'bf': 0.892127664786305, 'bq': 3, 'l1': 3.2544511558291775, 'l2': 1.2991598825232824}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[503]	valid_0's multi_logloss: 0.167926
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:18,483] Trial 5 finished with value: 0.9178558022970481 and parameters: {'lr': 0.09306403839410783, 'leaves': 246, 'minleaf': 11, 'ff': 0.7865007111131512, 'bf': 0.7644175748965234, 'bq': 2, 'l1': 2.4140471015494396, 'l2': 4.580768960990445}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[59]	valid_0's multi_logloss: 0.161337
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:19,478] Trial 6 finished with value: 0.918887871433264 and parameters: {'lr': 0.06434223754339352, 'leaves': 124, 'minleaf': 23, 'ff': 0.994958985542725, 'bf': 0.737169266818164, 'bq': 4, 'l1': 3.697088432749034, 'l2': 1.208828411008707}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[67]	valid_0's multi_logloss: 0.169744
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:20,704] Trial 7 finished with value: 0.9169430067873362 and parameters: {'lr': 0.043251906702453734, 'leaves': 144, 'minleaf': 78, 'ff': 0.8788271742092932, 'bf': 0.6717742178033673, 'bq': 7, 'l1': 3.0166927028947654, 'l2': 0.5044270477224755}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[95]	valid_0's multi_logloss: 0.181381
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:22,699] Trial 8 finished with value: 0.9140173967487607 and parameters: {'lr': 0.026763203672370336, 'leaves': 49, 'minleaf': 69, 'ff': 0.6269485167095119, 'bf': 0.9685367653098534, 'bq': 2, 'l1': 0.711742269863711, 'l2': 2.0381711184573987}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[190]	valid_0's multi_logloss: 0.170765
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:23,633] Trial 9 finished with value: 0.9225130048627698 and parameters: {'lr': 0.05628571357987679, 'leaves': 228, 'minleaf': 45, 'ff': 0.7362321507677807, 'bf': 0.6754535129887234, 'bq': 2, 'l1': 1.7161286284977844, 'l2': 0.4275817226771328}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[78]	valid_0's multi_logloss: 0.168639
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:27,092] Trial 10 finished with value: 0.9126597091115665 and parameters: {'lr': 0.011405914684535404, 'leaves': 185, 'minleaf': 31, 'ff': 0.8381438293814513, 'bf': 0.6957864614645908, 'bq': 6, 'l1': 0.0668996387454679, 'l2': 3.301235617629706}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[372]	valid_0's multi_logloss: 0.169171
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:28,576] Trial 11 finished with value: 0.9125274204980011 and parameters: {'lr': 0.038618227947835336, 'leaves': 204, 'minleaf': 41, 'ff': 0.7524920519803818, 'bf': 0.6065072720103933, 'bq': 5, 'l1': 2.0454569707156414, 'l2': 0.08139393237177966}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[165]	valid_0's multi_logloss: 0.170439
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:29,589] Trial 12 finished with value: 0.9227217438156955 and parameters: {'lr': 0.08150430269342361, 'leaves': 253, 'minleaf': 31, 'ff': 0.7374118392931446, 'bf': 0.6970485744195558, 'bq': 5, 'l1': 1.8569744920348148, 'l2': 0.9561602976341053}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[65]	valid_0's multi_logloss: 0.169909
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:30,615] Trial 13 finished with value: 0.92310687154421 and parameters: {'lr': 0.08723077356771394, 'leaves': 251, 'minleaf': 28, 'ff': 0.8207105125135601, 'bf': 0.7257388456754158, 'bq': 5, 'l1': 4.022026413281203, 'l2': 1.199376356009031}. Best is trial 2 with value: 0.926739729191402.


Early stopping, best iteration is:
[65]	valid_0's multi_logloss: 0.164569
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:33,078] Trial 14 finished with value: 0.92739341310449 and parameters: {'lr': 0.017022830931532375, 'leaves': 188, 'minleaf': 17, 'ff': 0.9157454594261318, 'bf': 0.8324258190934973, 'bq': 5, 'l1': 4.202244806423275, 'l2': 3.0498111120920903}. Best is trial 14 with value: 0.92739341310449.


Early stopping, best iteration is:
[285]	valid_0's multi_logloss: 0.159675
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:36,085] Trial 15 finished with value: 0.9271514523964072 and parameters: {'lr': 0.015964779269943156, 'leaves': 182, 'minleaf': 11, 'ff': 0.9252088558551047, 'bf': 0.8564492463061317, 'bq': 7, 'l1': 4.267923058353243, 'l2': 3.276112161032261}. Best is trial 14 with value: 0.92739341310449.


Early stopping, best iteration is:
[294]	valid_0's multi_logloss: 0.159448
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:39,025] Trial 16 finished with value: 0.9290604803464734 and parameters: {'lr': 0.015949445118216363, 'leaves': 100, 'minleaf': 12, 'ff': 0.8747169244828533, 'bf': 0.8799745138993791, 'bq': 7, 'l1': 4.0830562933862975, 'l2': 3.16952693286085}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[302]	valid_0's multi_logloss: 0.157472
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:42,014] Trial 17 finished with value: 0.9197954461661126 and parameters: {'lr': 0.014873923495855665, 'leaves': 92, 'minleaf': 20, 'ff': 0.8655822822273137, 'bf': 0.9423406159054727, 'bq': 6, 'l1': 3.317182668883643, 'l2': 4.292308110386957}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[311]	valid_0's multi_logloss: 0.162156
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:43,737] Trial 18 finished with value: 0.9262531867528049 and parameters: {'lr': 0.027233974031885645, 'leaves': 90, 'minleaf': 18, 'ff': 0.8727803374638469, 'bf': 0.9065967576767345, 'bq': 6, 'l1': 3.844643329064536, 'l2': 3.8279513026553182}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[177]	valid_0's multi_logloss: 0.161697
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:45,982] Trial 19 finished with value: 0.9249605924206047 and parameters: {'lr': 0.020074435468239733, 'leaves': 113, 'minleaf': 15, 'ff': 0.9944798488654405, 'bf': 0.8214610557682489, 'bq': 7, 'l1': 2.65902928515206, 'l2': 2.719272008926086}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[226]	valid_0's multi_logloss: 0.162512
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[389]	valid_0's multi_logloss: 0.162633


[I 2026-02-07 22:13:49,527] Trial 20 finished with value: 0.9233434221340019 and parameters: {'lr': 0.013318549777843585, 'leaves': 76, 'minleaf': 25, 'ff': 0.7772003476264638, 'bf': 0.8983641244201601, 'bq': 4, 'l1': 3.592297375853782, 'l2': 2.7649775798140954}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:52,411] Trial 21 finished with value: 0.9256454841897557 and parameters: {'lr': 0.015376495878345818, 'leaves': 173, 'minleaf': 10, 'ff': 0.9243380932667838, 'bf': 0.8542831694353279, 'bq': 7, 'l1': 4.245281265071748, 'l2': 3.3907202444385245}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[294]	valid_0's multi_logloss: 0.159419
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:55,213] Trial 22 finished with value: 0.9279925100282518 and parameters: {'lr': 0.022894776904380427, 'leaves': 175, 'minleaf': 14, 'ff': 0.8993885714358291, 'bf': 0.8633384350984501, 'bq': 6, 'l1': 4.363889010135235, 'l2': 3.9582496126049254}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[204]	valid_0's multi_logloss: 0.159622
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:13:58,154] Trial 23 finished with value: 0.9228215950276528 and parameters: {'lr': 0.025927664027098896, 'leaves': 165, 'minleaf': 16, 'ff': 0.8929279013614799, 'bf': 0.803509037469093, 'bq': 6, 'l1': 4.5934714623801955, 'l2': 3.978093200279754}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[203]	valid_0's multi_logloss: 0.16106
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:01,023] Trial 24 finished with value: 0.9257622563227256 and parameters: {'lr': 0.022197005203652802, 'leaves': 125, 'minleaf': 22, 'ff': 0.8368399949185994, 'bf': 0.933716694289872, 'bq': 6, 'l1': 4.127159304392842, 'l2': 3.862442283517211}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[240]	valid_0's multi_logloss: 0.161846
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:02,953] Trial 25 finished with value: 0.913908063608938 and parameters: {'lr': 0.03618092449329361, 'leaves': 200, 'minleaf': 51, 'ff': 0.9662554687427445, 'bf': 0.8773755082835334, 'bq': 4, 'l1': 2.8675654503138883, 'l2': 4.790426364897779}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[147]	valid_0's multi_logloss: 0.173321
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:05,863] Trial 26 finished with value: 0.918834409263357 and parameters: {'lr': 0.0191800637904372, 'leaves': 161, 'minleaf': 36, 'ff': 0.8498433951735904, 'bf': 0.8224322097053197, 'bq': 7, 'l1': 4.544700990133802, 'l2': 2.800935529742408}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[320]	valid_0's multi_logloss: 0.167464
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:09,015] Trial 27 finished with value: 0.9288698983661634 and parameters: {'lr': 0.012819491359382942, 'leaves': 112, 'minleaf': 17, 'ff': 0.8169652730081783, 'bf': 0.7731918225084373, 'bq': 6, 'l1': 4.957793324336428, 'l2': 4.151864073446361}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[389]	valid_0's multi_logloss: 0.162178
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[613]	valid_0's multi_logloss: 0.163699


[I 2026-02-07 22:14:13,094] Trial 28 finished with value: 0.9233924750053782 and parameters: {'lr': 0.012394204109906026, 'leaves': 68, 'minleaf': 24, 'ff': 0.7941083105306862, 'bf': 0.762913222606884, 'bq': 6, 'l1': 4.861797931166958, 'l2': 4.301549556828078}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:15,393] Trial 29 finished with value: 0.924992271157211 and parameters: {'lr': 0.03378927299129299, 'leaves': 107, 'minleaf': 15, 'ff': 0.9025716953315669, 'bf': 0.9915684635182567, 'bq': 7, 'l1': 3.584020788724912, 'l2': 4.972765840747121}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[134]	valid_0's multi_logloss: 0.158606
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[588]	valid_0's multi_logloss: 0.161079


[I 2026-02-07 22:14:19,644] Trial 30 finished with value: 0.9246271318190816 and parameters: {'lr': 0.012827580611910831, 'leaves': 129, 'minleaf': 10, 'ff': 0.803302640924048, 'bf': 0.6368262812956443, 'bq': 6, 'l1': 4.987489716070826, 'l2': 3.6027540883425044}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:22,685] Trial 31 finished with value: 0.9276967455277177 and parameters: {'lr': 0.016318760620303945, 'leaves': 132, 'minleaf': 18, 'ff': 0.9021221555662114, 'bf': 0.7754473958845487, 'bq': 5, 'l1': 4.447426266056976, 'l2': 3.1880110208784322}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[304]	valid_0's multi_logloss: 0.162833
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:25,450] Trial 32 finished with value: 0.9245183717337925 and parameters: {'lr': 0.014872524774863515, 'leaves': 141, 'minleaf': 28, 'ff': 0.9457293036226003, 'bf': 0.7806495796926037, 'bq': 5, 'l1': 4.520474435736797, 'l2': 4.30273175801882}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[323]	valid_0's multi_logloss: 0.169419
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:27,959] Trial 33 finished with value: 0.9244073743342263 and parameters: {'lr': 0.022057203040872234, 'leaves': 102, 'minleaf': 20, 'ff': 0.8933121940338792, 'bf': 0.7957267316601961, 'bq': 6, 'l1': 4.580603234816762, 'l2': 3.551471683642049}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[240]	valid_0's multi_logloss: 0.161848
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:30,981] Trial 34 finished with value: 0.9261884734317579 and parameters: {'lr': 0.01848850109117939, 'leaves': 134, 'minleaf': 14, 'ff': 0.8585584990597652, 'bf': 0.7530921352429817, 'bq': 1, 'l1': 3.893377196424696, 'l2': 3.0292268840574934}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[278]	valid_0's multi_logloss: 0.163787
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[540]	valid_0's multi_logloss: 0.170084


[I 2026-02-07 22:14:35,360] Trial 35 finished with value: 0.9185726714099993 and parameters: {'lr': 0.010179402458683044, 'leaves': 72, 'minleaf': 37, 'ff': 0.827572465789008, 'bf': 0.8718908012911887, 'bq': 4, 'l1': 4.427371597401826, 'l2': 2.3729773834077643}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:37,603] Trial 36 finished with value: 0.9190852947471767 and parameters: {'lr': 0.02416913686375747, 'leaves': 154, 'minleaf': 27, 'ff': 0.9609970938116006, 'bf': 0.9143712829018316, 'bq': 5, 'l1': 4.774965781828307, 'l2': 4.007822813422363}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[215]	valid_0's multi_logloss: 0.165673
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:39,798] Trial 37 finished with value: 0.9175604600909789 and parameters: {'lr': 0.013905407129293858, 'leaves': 118, 'minleaf': 65, 'ff': 0.880694296565255, 'bf': 0.7946564856358788, 'bq': 7, 'l1': 3.3307424207844796, 'l2': 1.654205098264322}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[322]	valid_0's multi_logloss: 0.173954
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:42,135] Trial 38 finished with value: 0.920610107708885 and parameters: {'lr': 0.017173184788537705, 'leaves': 91, 'minleaf': 52, 'ff': 0.8072226842464817, 'bf': 0.7315225514193486, 'bq': 6, 'l1': 4.75544806288487, 'l2': 4.520477315008764}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[343]	valid_0's multi_logloss: 0.169741
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:44,396] Trial 39 finished with value: 0.9236703886778547 and parameters: {'lr': 0.03000817250729747, 'leaves': 56, 'minleaf': 20, 'ff': 0.9441157310258007, 'bf': 0.8401973645835443, 'bq': 5, 'l1': 3.8527989455467444, 'l2': 3.646220152284072}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[161]	valid_0's multi_logloss: 0.162403
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[404]	valid_0's multi_logloss: 0.160184


[I 2026-02-07 22:14:49,318] Trial 40 finished with value: 0.9267180200805513 and parameters: {'lr': 0.012008468946650589, 'leaves': 149, 'minleaf': 13, 'ff': 0.9067159788001019, 'bf': 0.8774552001801604, 'bq': 3, 'l1': 4.998959015684841, 'l2': 3.145315687613186}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:52,976] Trial 41 finished with value: 0.9223346458085587 and parameters: {'lr': 0.016985479212705126, 'leaves': 221, 'minleaf': 17, 'ff': 0.9193707954807363, 'bf': 0.8362342853131254, 'bq': 5, 'l1': 4.31385764332523, 'l2': 2.9703614333060036}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[285]	valid_0's multi_logloss: 0.160946
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:56,290] Trial 42 finished with value: 0.9273192601981591 and parameters: {'lr': 0.019887617414162825, 'leaves': 187, 'minleaf': 18, 'ff': 0.8496151289737288, 'bf': 0.8138228440053367, 'bq': 4, 'l1': 4.131074261702986, 'l2': 1.996934937573466}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[250]	valid_0's multi_logloss: 0.159779
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:14:59,505] Trial 43 finished with value: 0.920245319670383 and parameters: {'lr': 0.0165035193927162, 'leaves': 175, 'minleaf': 32, 'ff': 0.9669182930416582, 'bf': 0.7650224471004102, 'bq': 5, 'l1': 3.5440199521650144, 'l2': 2.5519272659598684}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[275]	valid_0's multi_logloss: 0.170226
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[390]	valid_0's multi_logloss: 0.164118


[I 2026-02-07 22:15:04,944] Trial 44 finished with value: 0.9185354874269026 and parameters: {'lr': 0.011139998510771592, 'leaves': 193, 'minleaf': 23, 'ff': 0.9119263451139732, 'bf': 0.7840358596788407, 'bq': 6, 'l1': 1.0407448734661324, 'l2': 4.093469894044364}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[385]	valid_0's multi_logloss: 0.158228


[I 2026-02-07 22:15:10,338] Trial 45 finished with value: 0.9253457922629355 and parameters: {'lr': 0.013829007516833886, 'leaves': 213, 'minleaf': 13, 'ff': 0.8750158044349942, 'bf': 0.9238396013502623, 'bq': 5, 'l1': 4.360385259820776, 'l2': 3.4018883384246745}. Best is trial 16 with value: 0.9290604803464734.


Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:15:12,760] Trial 46 finished with value: 0.9138527932635941 and parameters: {'lr': 0.021181993981453394, 'leaves': 137, 'minleaf': 74, 'ff': 0.6670555039990896, 'bf': 0.8607229061358425, 'bq': 6, 'l1': 3.0971754786360273, 'l2': 3.6874479270023994}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[324]	valid_0's multi_logloss: 0.170398
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:15:17,132] Trial 47 finished with value: 0.9257776564429135 and parameters: {'lr': 0.018067834488076272, 'leaves': 103, 'minleaf': 12, 'ff': 0.7745810973405648, 'bf': 0.8331405063016275, 'bq': 7, 'l1': 2.3446720509647028, 'l2': 2.0694320959163734}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[258]	valid_0's multi_logloss: 0.158064
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:15:19,405] Trial 48 finished with value: 0.9253168350872732 and parameters: {'lr': 0.023812874628638316, 'leaves': 161, 'minleaf': 20, 'ff': 0.9317430284789722, 'bf': 0.7149227022142729, 'bq': 5, 'l1': 4.721773059389308, 'l2': 3.1387936943882218}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[205]	valid_0's multi_logloss: 0.166231
Training until validation scores don't improve for 100 rounds


[I 2026-02-07 22:15:22,193] Trial 49 finished with value: 0.9196978202914442 and parameters: {'lr': 0.029363419740075423, 'leaves': 118, 'minleaf': 27, 'ff': 0.8866523414647521, 'bf': 0.8885036235513852, 'bq': 7, 'l1': 4.016238566600025, 'l2': 2.540715773026972}. Best is trial 16 with value: 0.9290604803464734.


Early stopping, best iteration is:
[175]	valid_0's multi_logloss: 0.166632


In [23]:
# 15. TRAIN FINAL MODEL
best_params.update(dict(
    objective="multiclass",
    num_class=3,
    metric="multi_logloss",   # ← JANGAN "None"
    seed=42
))

ispu_model = lgb.train(
    best_params,
    lgb.Dataset(X_train, label=y_train),
    num_boost_round=2000,
    valid_sets=[lgb.Dataset(X_val, label=y_val)],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
)


Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.198149
[100]	valid_0's multi_logloss: 0.264381
Early stopping, best iteration is:
[26]	valid_0's multi_logloss: 0.187607


In [24]:
# THRESHOLD
probs_val = ispu_model.predict(X_val)

best_t = [0.33, 0.33, 0.33]

for c in range(3):
    best_f1 = 0
    for t in np.linspace(0.2, 0.6, 25):
        pred = probs_val.argmax(1)
        pred[probs_val[:, c] > t] = c
        f1 = f1_score(y_val, pred, average="macro")

        if f1 > best_f1:
            best_f1 = f1
            best_t[c] = t

print("Best threshold:", best_t)


Best threshold: [0.48333333333333334, 0.21666666666666667, 0.21666666666666667]


In [25]:
# FORECAST TEST
sub = pd.read_csv("sample_submission.csv")

sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]
sub = sub.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

sub["lokasi_enc"] = le_loc.transform(sub["lokasi_clean"])

last_dates = df_model.groupby("lokasi_clean")["tanggal"].max().to_dict()

forecast_rows = []

for loc in sub["lokasi_clean"].unique():

    hist = (
        df_model[df_model["lokasi_clean"] == loc]
        .sort_values("tanggal")
        .iloc[-3:][POLLUTANT_COLS]
        .values.tolist()
    )

    future_rows = sub[sub["lokasi_clean"] == loc].sort_values("tanggal")

    for _, row in future_rows.iterrows():

        tgl = row["tanggal"]

        h = (tgl - last_dates[loc]).days
        h = max(1, min(30, h))

        feat = {
            "horizon": h,
            "month": tgl.month,
            "dayofyear": tgl.dayofyear,
            "dayofweek": tgl.dayofweek,
            "lokasi_enc": row["lokasi_enc"],
        }

        for i, col in enumerate(POLLUTANT_COLS):
            feat[f"{col}_lag1"] = hist[-1][i]
            feat[f"{col}_lag2"] = hist[-2][i]
            feat[f"{col}_lag3"] = hist[-3][i]

        Xf = pd.DataFrame([feat])

        new_vals = []
        preds_pol = {}

        for i, col in enumerate(POLLUTANT_COLS):
            pred = pollutant_models[col].predict(Xf)[0]
            new_vals.append(pred)
            preds_pol[col] = pred

        hist.append(new_vals)
        hist.pop(0)

        feat.update(preds_pol)

        for i, col in enumerate(POLLUTANT_COLS):
            feat[f"{col}_lag1"] = hist[-1][i]
            feat[f"{col}_lag2"] = hist[-2][i]
            feat[f"{col}_lag3"] = hist[-3][i]

        feat["id"] = row["id"]
        forecast_rows.append(feat)

df_future = pd.DataFrame(forecast_rows)


In [26]:
# ===============================
# PREDIKSI ISPU (FINAL FIX)
# ===============================
results = []

for _, row in df_future.iterrows():

    feat_ispu = {col: row[col] for col in POLLUTANT_COLS}

    for col in POLLUTANT_COLS:
        feat_ispu[f"{col}_lag1"] = row[f"{col}_lag1"]
        feat_ispu[f"{col}_lag2"] = row[f"{col}_lag2"]
        feat_ispu[f"{col}_lag3"] = row[f"{col}_lag3"]

    month = row["month"]
    doy   = row["dayofyear"]
    dow   = row["dayofweek"]

    feat_ispu.update({
        "month_sin": np.sin(2*np.pi*month/12),
        "month_cos": np.cos(2*np.pi*month/12),
        "doy_sin": np.sin(2*np.pi*doy/365),
        "doy_cos": np.cos(2*np.pi*doy/365),
        "dow_sin": np.sin(2*np.pi*dow/7),
        "dow_cos": np.cos(2*np.pi*dow/7),
        "lokasi_enc": row["lokasi_enc"],
    })

    Xi = pd.DataFrame([feat_ispu])

    prob = ispu_model.predict(Xi)[0]

    # ===============================
    # THRESHOLD DECISION (BENAR)
    # ===============================
    adj_prob = prob - best_t
    pred = int(np.argmax(adj_prob))
    results.append(pred)


# ===============================
# BUILD SUBMISSION
# ===============================
INV_LABEL_MAP = {0: "BAIK", 1: "SEDANG", 2: "TIDAK SEHAT"}

sub_final = pd.read_csv("sample_submission.csv")

pred_df = df_future[["id"]].copy()
pred_df["kategori"] = [INV_LABEL_MAP[i] for i in results]

sub_final = sub_final.merge(pred_df, on="id", how="left")
sub_final.to_csv("submission_final.csv", index=False)

print("✅ submission_final.csv siap upload")


✅ submission_final.csv siap upload


In [27]:
train_df["target"].value_counts(normalize=True)


target
1.0    0.655833
2.0    0.188702
0.0    0.155465
Name: proportion, dtype: float64

In [28]:
probs_val = ispu_model.predict(X_val)
pred_val = probs_val.argmax(1)

print("Macro-F1 VAL:", f1_score(y_val, pred_val, average="macro"))


Macro-F1 VAL: 0.9012628016693925


In [29]:
probs_val = ispu_model.predict(X_val)
pred_val = probs_val.argmax(1)

print("Macro-F1:", f1_score(y_val, pred_val, average="macro"))
print(pd.Series(pred_val).value_counts(normalize=True))


Macro-F1: 0.9012628016693925
1    0.749917
0    0.152626
2    0.097456
Name: proportion, dtype: float64


In [None]:
for col in POLLUTANT_COLS:
    print(col, "std =", df_future[col].std())


pm_sepuluh std = 2.2983254802601243
pm_duakomalima std = 0.9432476586281882
ozon std = 7.027937012360093
nitrogen_dioksida std = 0.35573687418532174
sulfur_dioksida std = 0.26740948268988
karbon_monoksida std = 1.0674138541566285


: 