In [2]:
# =========================================================
# FINAL SINGLE-CELL NOTEBOOK
# DIRECT ISPU MULTI-HORIZON (MENTOK VERSION)
# =========================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

# ===============================
# LOAD DATA
# ===============================
PATH = Path("merged_libur_cuaca_ispu_ndvi.csv")
df = pd.read_csv(PATH, sep=";")
df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

DROP_COLS = ["max", "parameter_pencemar_kritis", "time", "id", "stasiun"]
df = df.drop(columns=DROP_COLS, errors="ignore")

# ===============================
# TARGET ENCODING
# ===============================
LABEL_MAP = {"BAIK":0, "SEDANG":1, "TIDAK SEHAT":2}
INV_LABEL_MAP = {0:"BAIK", 1:"SEDANG", 2:"TIDAK SEHAT"}

df = df[df["kategori"].notna()].copy()
df["target"] = df["kategori"].map(LABEL_MAP).astype(int)

# ===============================
# REINDEX DAILY
# ===============================
def reindex_daily(g):
    idx = pd.date_range(g["tanggal"].min(), g["tanggal"].max(), freq="D")
    g = g.set_index("tanggal").reindex(idx)
    g["lokasi_clean"] = g["lokasi_clean"].iloc[0]
    return g.reset_index().rename(columns={"index":"tanggal"})

df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)
df = df.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)

# ===============================
# TIME FEATURES
# ===============================
df["month"] = df["tanggal"].dt.month
df["dayofyear"] = df["tanggal"].dt.dayofyear
df["dayofweek"] = df["tanggal"].dt.dayofweek

df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)
df["doy_sin"]   = np.sin(2*np.pi*df["dayofyear"]/365)
df["doy_cos"]   = np.cos(2*np.pi*df["dayofyear"]/365)
df["dow_sin"]   = np.sin(2*np.pi*df["dayofweek"]/7)
df["dow_cos"]   = np.cos(2*np.pi*df["dayofweek"]/7)

# ===============================
# ISPU LAG & ROLLING (CORE)
# ===============================
def ispu_features(g):
    g = g.sort_values("tanggal")
    for l in [1,2,3,5,7]:
        g[f"ispu_lag_{l}"] = g["target"].shift(l)
    for w in [3,7,14]:
        g[f"ispu_roll_mean_{w}"] = g["target"].shift(1).rolling(w).mean()
        g[f"ispu_roll_std_{w}"]  = g["target"].shift(1).rolling(w).std()
    return g

df = df.groupby("lokasi_clean", group_keys=False).apply(ispu_features)

# ===============================
# POLUTANT CONTEXT (SECONDARY)
# ===============================
POLLUTANT_COLS = [
    "pm_sepuluh","pm_duakomalima","ozon",
    "nitrogen_dioksida","sulfur_dioksida","karbon_monoksida"
]

for col in POLLUTANT_COLS:
    for l in [1,2,3]:
        df[f"{col}_lag_{l}"] = df[col].shift(l)

# ===============================
# LOCATION ENCODING
# ===============================
le_loc = LabelEncoder()
df["lokasi_enc"] = le_loc.fit_transform(df["lokasi_clean"])

# ===============================
# MULTI-HORIZON SUPERVISION
# ===============================
H = 30
rows = []

for loc, g in df.groupby("lokasi_clean"):
    g = g.sort_values("tanggal").reset_index(drop=True)
    for h in range(1, H+1):
        temp = g.copy()
        temp["horizon"] = h
        temp["target_h"] = temp["target"].shift(-h)
        rows.append(temp)

train_df = pd.concat(rows, ignore_index=True)
train_df = train_df.dropna(subset=["target_h"]).reset_index(drop=True)
train_df["target_h"] = train_df["target_h"].astype(int)

# ===============================
# FEATURE LIST
# ===============================
FEATURES = (
    [c for c in train_df.columns if c.startswith("ispu_")]
    + [c for c in train_df.columns if c.endswith("_lag_1") or c.endswith("_lag_2") or c.endswith("_lag_3")]
    + ["month_sin","month_cos","doy_sin","doy_cos","dow_sin","dow_cos","lokasi_enc","horizon"]
)

# ===============================
# COST-SENSITIVE WEIGHT
# ===============================
counts = train_df["target_h"].value_counts().sort_index()
class_weights = {i: counts.sum()/counts[i] for i in counts.index}
weights = train_df["target_h"].map(class_weights)

# ===============================
# TRAIN MODEL
# ===============================
X = train_df[FEATURES]
y = train_df["target_h"]

train_data = lgb.Dataset(X, label=y, weight=weights)

params = {
    "objective": "multiclass",
    "num_class": 3,
    "learning_rate": 0.025,
    "num_leaves": 191,
    "min_data_in_leaf": 20,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.5,
    "lambda_l2": 1.0,
    "verbosity": -1,
    "seed": SEED
}

model = lgb.train(params, train_data, num_boost_round=1200)

# ===============================
# INFERENCE
# ===============================
sub = pd.read_csv("sample_submission.csv")
sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]
sub = sub.sort_values(["lokasi_clean","tanggal"]).reset_index(drop=True)
sub["lokasi_enc"] = le_loc.transform(sub["lokasi_clean"])

results = []

for loc, gsub in sub.groupby("lokasi_clean"):
    hist = df[df["lokasi_clean"] == loc].sort_values("tanggal").iloc[-14:]
    hist_ispu = hist["target"].tolist()

    for _, row in gsub.iterrows():
        tgl = row["tanggal"]

        feat = {
            "month_sin": np.sin(2*np.pi*tgl.month/12),
            "month_cos": np.cos(2*np.pi*tgl.month/12),
            "doy_sin": np.sin(2*np.pi*tgl.dayofyear/365),
            "doy_cos": np.cos(2*np.pi*tgl.dayofyear/365),
            "dow_sin": np.sin(2*np.pi*tgl.dayofweek/7),
            "dow_cos": np.cos(2*np.pi*tgl.dayofweek/7),
            "lokasi_enc": row["lokasi_enc"],
            "horizon": 1
        }

        for l in [1,2,3,5,7]:
            feat[f"ispu_lag_{l}"] = hist_ispu[-l]

        feat["ispu_roll_mean_3"]  = np.mean(hist_ispu[-3:])
        feat["ispu_roll_std_3"]   = np.std(hist_ispu[-3:])
        feat["ispu_roll_mean_7"]  = np.mean(hist_ispu[-7:])
        feat["ispu_roll_std_7"]   = np.std(hist_ispu[-7:])
        feat["ispu_roll_mean_14"] = np.mean(hist_ispu[-14:])
        feat["ispu_roll_std_14"]  = np.std(hist_ispu[-14:])

        Xi = pd.DataFrame([feat])
        pred = model.predict(Xi).argmax(1)[0]

        hist_ispu.append(pred)
        hist_ispu.pop(0)
        results.append(pred)

# ===============================
# EXPORT
# ===============================
sub_final = sub.copy()
sub_final["kategori"] = [INV_LABEL_MAP[i] for i in results]
sub_final[["id","kategori"]].to_csv("last_day1l.csv", index=False)

print("âœ… DONE. last_day1l.csv generated.")

LightGBMError: Feature (ispu_lag_1) appears more than one time.