In [None]:
# =========================================================
# FINAL BOSS SINGLE-CELL NOTEBOOK
# PER-LOCATION + TEMPORAL ENSEMBLE + STACKING META MODEL
# =========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

# ===============================
# LOAD & CLEAN
# ===============================
# The source file is semicolon-separated and stores dates day-first.
df = pd.read_csv("merged_libur_cuaca_ispu_ndvi.csv", sep=";")
df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)
df = df.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)
# errors="ignore" lets the drop succeed when some of these columns are absent.
df = df.drop(columns=["max","parameter_pencemar_kritis","time","id","stasiun"], errors="ignore")

# Category name <-> integer class code used as the model target.
LABEL_MAP = {"BAIK":0,"SEDANG":1,"TIDAK SEHAT":2}
INV_LABEL_MAP = {0:"BAIK",1:"SEDANG",2:"TIDAK SEHAT"}

# Normalise whitespace/case before mapping so that values such as " baik "
# still resolve. Anything that is missing or not one of the three known
# categories maps to NaN and is dropped here, instead of making the
# integer cast below raise.
_target = df["kategori"].astype(str).str.strip().str.upper().map(LABEL_MAP)
_unknown = int((_target.isna() & df["kategori"].notna()).sum())
if _unknown:
    print(f"Dropping {_unknown} rows whose kategori is not in LABEL_MAP")

# .copy() so the new column is written to an independent frame rather than
# to a filtered view of the original.
df = df[_target.notna()].copy()
df["target"] = _target.loc[df.index].astype(int)

# ===============================
# REINDEX DAILY
# ===============================
def reindex_daily(g):
    """Expand one location's rows onto a gap-free daily calendar.

    The calendar runs from the earliest to the latest ``tanggal`` in ``g``.
    Days that had no row are inserted with NaN in every column except
    ``lokasi_clean``, which is filled with the group's location value.

    Returns a new frame with ``tanggal`` as a regular column again.
    """
    first_day = g["tanggal"].min()
    last_day = g["tanggal"].max()
    calendar = pd.date_range(first_day, last_day, freq="D")

    daily = g.set_index("tanggal").reindex(calendar)
    # The first calendar day is always an observed row, so its location
    # value is the one to broadcast over the inserted days.
    location = daily["lokasi_clean"].iloc[0]
    daily["lokasi_clean"] = location

    flat = daily.reset_index()
    return flat.rename(columns={"index": "tanggal"})

# Re-index every location onto its own daily calendar.
# The groups are iterated explicitly instead of going through
# groupby(...).apply: whether apply hands the grouping column to the callback
# depends on the pandas version, and reindex_daily needs "lokasi_clean" to be
# present in the frame it receives.
df = pd.concat(
    [reindex_daily(group) for _, group in df.groupby("lokasi_clean")],
    ignore_index=True,
)
df = df.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)

# ===============================
# TIME FEATURES
# ===============================
# Cyclical calendar encodings: each component is mapped to a sin/cos pair
# using the period given below (12 for month, 365 for day of year,
# 7 for day of week).
_calendar = df["tanggal"].dt
for _prefix, _component, _period in (
    ("month", _calendar.month, 12),
    ("doy", _calendar.dayofyear, 365),
    ("dow", _calendar.dayofweek, 7),
):
    _angle = 2*np.pi*_component/_period
    df[f"{_prefix}_sin"] = np.sin(_angle)
    df[f"{_prefix}_cos"] = np.cos(_angle)

# ===============================
# FEATURE BUILDERS
# ===============================
def build_features(g, mode):
    """Add lagged and rolling statistics of ``target`` to one location's rows.

    ``mode == "short"`` uses lags 1, 2, 3 and a 3-row rolling window; any
    other value uses lags 3, 7, 14 and rolling windows of 7 and 14 rows.
    Rolling statistics are computed on the target shifted by one row, so a
    row's own target never enters its features.

    Returns a copy of ``g`` sorted by ``tanggal`` with the ``ispu_*``
    columns appended; ``g`` itself is not modified.
    """
    if mode == "short":
        lag_steps, window_sizes = (1, 2, 3), (3,)
    else:
        lag_steps, window_sizes = (3, 7, 14), (7, 14)

    frame = g.sort_values("tanggal")
    series = frame["target"]

    for step in lag_steps:
        frame[f"ispu_lag_{step}"] = series.shift(step)

    # Shift once, then roll: the window ends on the previous row.
    previous = series.shift(1)
    for size in window_sizes:
        window = previous.rolling(size)
        frame[f"ispu_roll_mean_{size}"] = window.mean()
        frame[f"ispu_roll_std_{size}"] = window.std()

    return frame

# ===============================
# COLLECT META TRAINING DATA
# ===============================
# For every location and every feature mode ("short"/"long") this section:
#   1. stacks the feature table once per forecast horizon 1..H,
#   2. produces out-of-fold class probabilities with a forward-chaining CV,
#   3. fits a final base model on all rows (stored in base_models[loc][mode]),
#   4. appends the out-of-fold probabilities to meta_rows as training data
#      for the stacking model.
H = 30

# LightGBM settings shared by the fold models and the final base models.
BASE_PARAMS = {
    "objective":"multiclass",
    "num_class":3,
    "learning_rate":0.03,
    "num_leaves":127,
    "min_data_in_leaf":20,
    "verbosity":-1,
    "seed":SEED
}


def _balanced_weights(labels):
    """Return one weight per row, inversely proportional to its class frequency."""
    counts = labels.value_counts()
    return labels.map(counts.sum() / counts)


meta_rows = []
base_models = {}

for loc, g in df.groupby("lokasi_clean"):
    base_models[loc] = {}

    for mode in ["short","long"]:
        g_feat = build_features(g.copy(), mode)

        rows = []
        for h in range(1, H+1):
            temp = g_feat.copy()
            temp["horizon"] = h
            temp["target_h"] = temp["target"].shift(-h)
            rows.append(temp)

        # The concatenation is ordered horizon-by-horizon, so it is sorted by
        # date before splitting; otherwise TimeSeriesSplit would cut the table
        # by horizon instead of by time.
        # NOTE(review): dropna() looks at every column, not only the model
        # inputs -- confirm that unrelated columns cannot wipe out rows.
        data = (
            pd.concat(rows)
            .dropna()
            .sort_values(["tanggal","horizon"])
            .reset_index(drop=True)
        )

        FEATURES = [c for c in data.columns if c.startswith("ispu_")] + [
            "month_sin","month_cos","doy_sin","doy_cos","dow_sin","dow_cos","horizon"
        ]

        X = data[FEATURES]
        y = data["target_h"].astype(int)

        oof_pred = np.zeros((len(X),3))
        # TimeSeriesSplit never validates on the earliest chunk; this mask
        # records which rows actually received an out-of-fold prediction.
        has_oof = np.zeros(len(X), dtype=bool)

        for tr_idx, val_idx in TimeSeriesSplit(n_splits=5).split(X):
            X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]

            dtrain = lgb.Dataset(X_tr, label=y_tr, weight=_balanced_weights(y_tr))
            model = lgb.train(dict(BASE_PARAMS), dtrain, num_boost_round=500)

            oof_pred[val_idx] = model.predict(X.iloc[val_idx])
            has_oof[val_idx] = True

        # Final base model used at inference. It gets the same class weighting
        # as the fold models so that its probabilities follow the distribution
        # the meta model is trained on.
        final_model = lgb.train(
            dict(BASE_PARAMS),
            lgb.Dataset(X, label=y, weight=_balanced_weights(y)),
            num_boost_round=600
        )

        base_models[loc][mode] = final_model

        # Build the meta rows column-wise and keep only rows with a genuine
        # out-of-fold prediction (the others would carry all-zero
        # "probabilities").
        meta_part = pd.DataFrame({
            "loc": loc,
            "p0": oof_pred[:,0],
            "p1": oof_pred[:,1],
            "p2": oof_pred[:,2],
            "mode": mode,
            "horizon": data["horizon"].to_numpy(),
            "month_sin": data["month_sin"].to_numpy(),
            "doy_sin": data["doy_sin"].to_numpy(),
            "target": data["target_h"].to_numpy(),
        })
        meta_rows.extend(meta_part[has_oof].to_dict("records"))

# ===============================
# TRAIN META MODEL
# ===============================
# Stacking model: learns the final class from the base models'
# out-of-fold probabilities plus horizon, two calendar features and the
# one-hot encoded feature mode.
meta_df = pd.get_dummies(pd.DataFrame(meta_rows), columns=["mode"])

# Everything except the label and the location name is a model input.
_excluded = ("target", "loc")
META_FEATURES = [col for col in meta_df.columns if col not in _excluded]

_meta_params = {
    "objective":"multiclass",
    "num_class":3,
    "learning_rate":0.05,
    "num_leaves":63,
    "min_data_in_leaf":50,
    "verbosity":-1,
    "seed":SEED
}

dmeta = lgb.Dataset(meta_df[META_FEATURES], label=meta_df["target"])
meta_model = lgb.train(_meta_params, dmeta, num_boost_round=500)

# ===============================
# INFERENCE (STACKED)
# ===============================
# Recursive one-step-ahead forecast for every row of the submission file:
# both base models score the row, the meta model combines them, and the
# predicted class is appended to the location's history for the next day.
sub = pd.read_csv("sample_submission.csv")
_id_parts = sub["id"].str.split("_")
sub["tanggal"] = pd.to_datetime(_id_parts.str[0])
sub["lokasi_clean"] = _id_parts.str[1]
sub = sub.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)

# Lag steps and rolling windows per mode; must mirror build_features.
MODE_WINDOWS = {
    "short": ([1,2,3],[3]),
    "long":  ([3,7,14],[7,14]),
}

pred_by_row = {}

for loc, gsub in sub.groupby("lokasi_clean"):
    hist = df[df["lokasi_clean"] == loc].sort_values("tanggal").iloc[-20:]
    hist_ispu = hist["target"].tolist()

    for row_idx, tgl in gsub["tanggal"].items():
        # A training row dated t holds the calendar features of t, lags of the
        # targets before t, and (for horizon 1) the label of t+1. To predict
        # `tgl` the feature row is therefore anchored on the previous day.
        # NOTE(review): assumes each location's first submission date is the
        # day after its last historical date -- confirm.
        anchor = tgl - pd.Timedelta(days=1)
        known = hist_ispu[:-1]  # targets strictly before the anchor day

        time_feat = {
            "month_sin": np.sin(2*np.pi*anchor.month/12),
            "month_cos": np.cos(2*np.pi*anchor.month/12),
            "doy_sin": np.sin(2*np.pi*anchor.dayofyear/365),
            "doy_cos": np.cos(2*np.pi*anchor.dayofyear/365),
            "dow_sin": np.sin(2*np.pi*anchor.dayofweek/7),
            "dow_cos": np.cos(2*np.pi*anchor.dayofweek/7),
            "horizon": 1
        }

        meta_input = []

        for mode, (lags, rolls) in MODE_WINDOWS.items():
            feat = dict(time_feat)

            for l in lags:
                feat[f"ispu_lag_{l}"] = known[-l]
            for r in rolls:
                window = known[-r:]
                feat[f"ispu_roll_mean_{r}"] = np.mean(window)
                # ddof=1 matches pandas' rolling().std() used for training.
                feat[f"ispu_roll_std_{r}"]  = np.std(window, ddof=1)

            booster = base_models[loc][mode]
            # Booster.predict reads columns by position, so the frame is put
            # into the exact column order the booster was trained with.
            Xi = pd.DataFrame([feat])[booster.feature_name()]
            prob = booster.predict(Xi)[0]

            meta_input.append({
                "p0":prob[0],"p1":prob[1],"p2":prob[2],
                "mode":mode,
                "horizon":1,
                "month_sin":feat["month_sin"],
                "doy_sin":feat["doy_sin"]
            })

        meta_in = pd.DataFrame(meta_input)
        meta_in = pd.get_dummies(meta_in, columns=["mode"])
        meta_in = meta_in.reindex(columns=META_FEATURES, fill_value=0)

        # Average the meta probabilities of the two modes, then take the
        # most likely class.
        final_pred = int(meta_model.predict(meta_in).mean(axis=0).argmax())

        # Slide the history window forward by one day.
        hist_ispu.append(final_pred)
        hist_ispu.pop(0)
        pred_by_row[row_idx] = final_pred

# One prediction per submission row, in the row order of `sub`.
results = [pred_by_row[i] for i in sub.index]

# ===============================
# EXPORT
# ===============================
# Turn the numeric class codes back into category names and write the
# two-column submission file.
sub["kategori"] = list(map(INV_LABEL_MAP.__getitem__, results))
_submission = sub.loc[:, ["id","kategori"]]
_submission.to_csv("submission_FINAL_BOSS.csv", index=False)

print("ðŸ‘‘ FINAL BOSS CLEARED. submission_FINAL_BOSS.csv GENERATED.")


🔥 FINAL BOSS FIXED GENERATED: submission_FINAL_BOSS_FIXED.csv
