In [2]:
# =========================================================
# 0. IMPORT
# =========================================================
import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

SEED = 42
np.random.seed(SEED)

# =========================================================
# 1. LOAD & BASIC CLEANING
# =========================================================
PATH = Path("merged_libur_cuaca_ispu_ndvi.csv")
df = pd.read_csv(PATH, sep=";")

df["tanggal"] = pd.to_datetime(df["tanggal"], dayfirst=True)
df = df.sort_values(["lokasi_clean", "tanggal"]).reset_index(drop=True)

DROP_COLS = ["max", "parameter_pencemar_kritis", "time", "id", "stasiun"]
df = df.drop(columns=DROP_COLS, errors="ignore")

LABEL_MAP = {"BAIK": 0, "SEDANG": 1, "TIDAK SEHAT": 2}
df = df[df["kategori"].notna()].copy()
df["target"] = df["kategori"].map(LABEL_MAP).astype(int)

# =========================================================
# 2. REINDEX DAILY PER LOKASI
# =========================================================
def reindex_daily(g):
    idx = pd.date_range(g["tanggal"].min(), g["tanggal"].max(), freq="D")
    g = g.set_index("tanggal").reindex(idx)
    g["lokasi_clean"] = g["lokasi_clean"].iloc[0]
    return g.reset_index().rename(columns={"index": "tanggal"})

df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)

# =========================================================
# 3. PM2.5 IMPUTATION + FLAG
# =========================================================
df["pm25_missing"] = df["pm_duakomalima"].isna().astype(int)

median_pm25 = (
    df[df["tanggal"] >= "2021-01-01"]
    .groupby("lokasi_clean")["pm_duakomalima"]
    .median()
)

df["pm_duakomalima"] = df["pm_duakomalima"].fillna(
    df["lokasi_clean"].map(median_pm25)
)

df = df[df["target"].notna()].copy()

# =========================================================
# 4. TIME FEATURES
# =========================================================
df["month"] = df["tanggal"].dt.month
df["dayofyear"] = df["tanggal"].dt.dayofyear
df["dayofweek"] = df["tanggal"].dt.dayofweek

df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)
df["doy_sin"] = np.sin(2*np.pi*df["dayofyear"]/365)
df["doy_cos"] = np.cos(2*np.pi*df["dayofyear"]/365)

# =========================================================
# 5. LAG + ROLLING + TREND FEATURES
# =========================================================
POLLUTANT_COLS = [
    "pm_sepuluh","pm_duakomalima","ozon",
    "nitrogen_dioksida","sulfur_dioksida","karbon_monoksida",
]

LAGS = [1,2,3,7]
ROLL = [3,7,14]

def create_features(g):
    g = g.sort_values("tanggal")

    for col in POLLUTANT_COLS:
        for l in LAGS:
            g[f"{col}_lag{l}"] = g[col].shift(l)

        for w in ROLL:
            g[f"{col}_rmean{w}"] = g[col].shift(1).rolling(w).mean()
            g[f"{col}_rstd{w}"]  = g[col].shift(1).rolling(w).std()

        # trend / delta
        g[f"{col}_diff1"] = g[col].diff(1)
        g[f"{col}_diff7"] = g[col].diff(7)

    return g

df = df.groupby("lokasi_clean", group_keys=False).apply(create_features)

# =========================================================
# 6. DROP NA FROM LAGGING
# =========================================================
df_model = df.dropna().reset_index(drop=True)

# =========================================================
# 7. ENCODE LOKASI
# =========================================================
le_loc = LabelEncoder()
df_model["lokasi_enc"] = le_loc.fit_transform(df_model["lokasi_clean"])

# =========================================================
# 8. TIME-BASED SPLIT (NO LEAKAGE)
# =========================================================
split_date = df_model["tanggal"].quantile(0.8)

train = df_model[df_model["tanggal"] <= split_date]
valid = df_model[df_model["tanggal"] > split_date]

# =========================================================
# 9. TRAIN ISPU MULTICLASS MODEL
# =========================================================
FEATURES = (
    [c for c in df_model.columns if any(x in c for x in ["lag","rmean","rstd","diff"])]
    + ["month_sin","month_cos","doy_sin","doy_cos","lokasi_enc"]
)

X_train, y_train = train[FEATURES], train["target"]
X_valid, y_valid = valid[FEATURES], valid["target"]

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "learning_rate": 0.03,
    "num_leaves": 63,
    "min_data_in_leaf": 40,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "class_weight": "balanced",
    "seed": SEED,
    "verbosity": -1,
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=2000,
    callbacks=[lgb.early_stopping(200, verbose=False)]
)

# =========================================================
# 10. THRESHOLD TUNING (VALIDATION ONLY)
# =========================================================
probs = model.predict(X_valid)

best_t = [0.33,0.33,0.33]

for c in range(3):
    best_f1 = 0
    for t in np.linspace(0.2,0.7,40):
        pred = probs.argmax(1)
        pred[probs[:,c] > t] = c
        f1 = f1_score(y_valid, pred, average="macro")
        if f1 > best_f1:
            best_f1, best_t[c] = f1, t

print("Best threshold:", best_t)
print("Valid Macro-F1:", best_f1)

# =========================================================
# 11. INFERENCE → SUBMISSION
# =========================================================
sub = pd.read_csv("sample_submission.csv")

sub["tanggal"] = pd.to_datetime(sub["id"].str.split("_").str[0])
sub["lokasi_clean"] = sub["id"].str.split("_").str[1]
sub["lokasi_enc"] = le_loc.transform(sub["lokasi_clean"])

# ambil history terakhir tiap lokasi
history = (
    df_model.sort_values("tanggal")
    .groupby("lokasi_clean")
    .tail(14)
)

rows = []

for _, r in sub.iterrows():

    loc_hist = history[history["lokasi_clean"] == r["lokasi_clean"]].copy()

    feat = loc_hist.iloc[-1:][FEATURES].copy()
    feat["lokasi_enc"] = r["lokasi_enc"]

    prob = model.predict(feat)
    pred = prob.argmax(1)

    for c,t in enumerate(best_t):
        pred[prob[:,c] > t] = c

    rows.append(int(pred[0]))

INV_LABEL = {0:"BAIK",1:"SEDANG",2:"TIDAK SEHAT"}

sub["kategori"] = [INV_LABEL[i] for i in rows]
sub[["id","kategori"]].to_csv("submission_final.csv", index=False)

print("✅ submission_final.csv siap upload")


  df = df.groupby("lokasi_clean", group_keys=False).apply(reindex_daily)
  df = df.groupby("lokasi_clean", group_keys=False).apply(create_features)


Best threshold: [0.49487179487179483, 0.49487179487179483, 0.23846153846153847]
Valid Macro-F1: 0.845020814656564
✅ submission_final.csv siap upload
